Example #1
 def test_two_groupings(self):
     """Tests two groupings of input sequences in which the first
     grouping has two sequences and the second grouping has one
     sequence.

     Note that this test is dependent on the default values for
     generating candidate probes: probe length of 100 bp with a stride
     of 50 bp.
     """
     seqs = [[genome.Genome.from_one_seq('A' * 200),
              genome.Genome.from_one_seq('B' * 150)],
             [genome.Genome.from_one_seq('C' * 300)]]
     desired_candidate_probes = \
         ['A' * 100, 'A' * 100, 'A' * 100, 'B' * 100, 'B' * 100,
          'C' * 100, 'C' * 100, 'C' * 100, 'C' * 100, 'C' * 100]
     desired_candidate_probes = \
         [probe.Probe.from_str(s) for s in desired_candidate_probes]
     desired_final_probes = ['A' * 100, 'B' * 100, 'C' * 100]
     desired_final_probes = \
         [probe.Probe.from_str(s) for s in desired_final_probes]
     df = duplicate_filter.DuplicateFilter()
     pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=100,
         probe_stride=50)
     pb.design()
     self.assertEqual(pb.candidate_probes, desired_candidate_probes)
     self.assertEqual(pb.final_probes, desired_final_probes)
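The candidate counts above follow from sliding-window arithmetic over each sequence. As a rough check, here is a minimal sketch (num_candidate_probes is a hypothetical helper, not part of catch; it assumes one window starts at every stride offset that fits, and catch's generator may additionally anchor a final window at the sequence end, which these lengths happen not to need):

    def num_candidate_probes(seq_len, probe_length=100, probe_stride=50):
        # Number of length-probe_length windows at stride probe_stride
        # that fit entirely within a sequence of length seq_len.
        if seq_len < probe_length:
            return 0
        return (seq_len - probe_length) // probe_stride + 1

    assert num_candidate_probes(200) == 3  # the 'A' * 200 genome
    assert num_candidate_probes(150) == 2  # the 'B' * 150 genome
    assert num_candidate_probes(300) == 5  # the 'C' * 300 genome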
Example #2
 def test_basic(self):
     input_seqs = ['ATCGTCGCGG', 'ATCGTAGCGG', 'ATCGTCACGG', 'ATCGTAGCGG',
                   'ATTGTCGCGG', 'ATCGTCGCGG']
     desired_output = ['ATCGTCGCGG', 'ATCGTAGCGG', 'ATCGTCACGG',
                       'ATTGTCGCGG']
     input_probes = [probe.Probe.from_str(s) for s in input_seqs]
     desired_output_probes = [probe.Probe.from_str(s)
                              for s in desired_output]
     f = duplicate_filter.DuplicateFilter()
     f.filter(input_probes)
     self.assertCountEqual(f.input_probes, input_probes)
     # Order should be preserved, so use assertEqual rather than
     # assertCountEqual
     self.assertEqual(f.output_probes, desired_output_probes)
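The expected behavior here -- drop repeated sequences while keeping first-occurrence order -- is exactly order-preserving deduplication. A minimal sketch of that idea in plain Python (on strings rather than Probe objects; this is not catch's implementation):

    def dedup_preserving_order(items):
        # dict keys preserve insertion order (Python 3.7+), so this
        # keeps the first occurrence of each item and drops the rest.
        return list(dict.fromkeys(items))

    seqs = ['ATCGTCGCGG', 'ATCGTAGCGG', 'ATCGTCACGG', 'ATCGTAGCGG',
            'ATTGTCGCGG', 'ATCGTCGCGG']
    assert dedup_preserving_order(seqs) == [
        'ATCGTCGCGG', 'ATCGTAGCGG', 'ATCGTCACGG', 'ATTGTCGCGG']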
Example #3
 def test_one_filter1(self):
     """A basic test with a duplicate filter and one input sequence.
     Note that this test is dependent on the default values for
     generating candidate probes: probe length of 100 bp with a stride
     of 50 bp.
     """
     seqs = [
         [genome.Genome.from_one_seq('A' * 100 + 'B' * 100 + 'A' * 100)]]
     desired_candidate_probes = \
         ['A' * 100, 'A' * 50 + 'B' * 50, 'B' * 100, 'B' * 50 + 'A' * 50,
          'A' * 100]
     desired_candidate_probes = \
         [probe.Probe.from_str(s) for s in desired_candidate_probes]
     desired_final_probes = ['A' * 100, 'A' * 50 + 'B' * 50, 'B' * 100,
                             'B' * 50 + 'A' * 50]
     desired_final_probes = \
         [probe.Probe.from_str(s) for s in desired_final_probes]
     df = duplicate_filter.DuplicateFilter()
     pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=100,
                                       probe_stride=50)
     pb.design()
     self.assertEqual(pb.candidate_probes, desired_candidate_probes)
     self.assertEqual(pb.final_probes, desired_final_probes)
Example #4
 def test_with_small_sequences(self):
     """A test with a duplicate filter and input sequences that are smaller
     than the probe length.
     """
     seqs = [[genome.Genome.from_one_seq('ABCDEFGHIJKLMN'),
              genome.Genome.from_one_seq('ABCDEFGHIXKLMN'),
              genome.Genome.from_one_seq('XYZAB')]]
     desired_candidate_probes = \
         ['ABCDEF', 'DEFGHI', 'GHIJKL', 'IJKLMN',
          'ABCDEF', 'DEFGHI', 'GHIXKL', 'IXKLMN',
          'XYZAB']
     desired_candidate_probes = \
         [probe.Probe.from_str(s) for s in desired_candidate_probes]
     desired_final_probes = ['ABCDEF', 'DEFGHI', 'GHIJKL', 'IJKLMN',
                             'GHIXKL', 'IXKLMN', 'XYZAB']
     desired_final_probes = \
         [probe.Probe.from_str(s) for s in desired_final_probes]
     df = duplicate_filter.DuplicateFilter()
     pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=6,
                                       probe_stride=3,
                                       allow_small_seqs=5)
     pb.design()
     self.assertEqual(pb.candidate_probes, desired_candidate_probes)
     self.assertEqual(pb.final_probes, desired_final_probes)
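This test implies two details of candidate generation beyond plain striding: a final window is anchored flush with the sequence end when the last strided window would leave bases uncovered (hence 'IJKLMN'), and a sequence shorter than the probe length is kept whole if it is at least allow_small_seqs long (hence 'XYZAB'). A sketch of that layout (candidate_windows is a hypothetical helper, not catch's API):

    def candidate_windows(seq, probe_length=6, probe_stride=3,
                          allow_small_seqs=5):
        # Short sequences are kept whole if they meet the minimum length.
        if len(seq) < probe_length:
            return [seq] if len(seq) >= allow_small_seqs else []
        starts = list(range(0, len(seq) - probe_length + 1, probe_stride))
        # Anchor one more window at the end if bases would be uncovered.
        if starts[-1] + probe_length < len(seq):
            starts.append(len(seq) - probe_length)
        return [seq[i:i + probe_length] for i in starts]

    assert candidate_windows('ABCDEFGHIJKLMN') == [
        'ABCDEF', 'DEFGHI', 'GHIJKL', 'IJKLMN']
    assert candidate_windows('XYZAB') == ['XYZAB']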
Example #5
 def test_one_filter2(self):
     """A basic test with a duplicate filter and one input sequence.
     Note that this test uses a probe length of 75 bp and a stride of
     25 bp.
     """
     seqs = [
         [genome.Genome.from_one_seq('A' * 100 + 'B' * 100 + 'A' * 100)]]
     desired_candidate_probes = \
         ['A' * 75, 'A' * 75, 'A' * 50 + 'B' * 25, 'A' * 25 + 'B' * 50,
          'B' * 75, 'B' * 75, 'B' * 50 + 'A' * 25, 'B' * 25 + 'A' * 50,
          'A' * 75, 'A' * 75]
     desired_candidate_probes = \
         [probe.Probe.from_str(s) for s in desired_candidate_probes]
     desired_final_probes = ['A' * 75, 'A' * 50 + 'B' * 25,
                             'A' * 25 + 'B' * 50, 'B' * 75,
                             'B' * 50 + 'A' * 25, 'B' * 25 + 'A' * 50]
     desired_final_probes = \
         [probe.Probe.from_str(s) for s in desired_final_probes]
     df = duplicate_filter.DuplicateFilter()
     pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=75,
                                       probe_stride=25)
     pb.design()
     self.assertEqual(pb.candidate_probes, desired_candidate_probes)
     self.assertEqual(pb.final_probes, desired_final_probes)
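The expected lists above can be checked with plain string arithmetic, independent of catch: ten strided windows over the 300 bp sequence, of which six are distinct in first-occurrence order.

    seq = 'A' * 100 + 'B' * 100 + 'A' * 100
    # Windows of 75 bp at stride 25: starts 0, 25, ..., 225 (10 windows)
    windows = [seq[i:i + 75] for i in range(0, len(seq) - 75 + 1, 25)]
    assert len(windows) == 10
    # Deduplicating while preserving order leaves the six final probes
    assert list(dict.fromkeys(windows)) == [
        'A' * 75, 'A' * 50 + 'B' * 25, 'A' * 25 + 'B' * 50,
        'B' * 75, 'B' * 50 + 'A' * 25, 'B' * 25 + 'A' * 50]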
Example #6
def main(args):
    # Read the FASTA sequences
    ds = args.dataset
    try:
        if os.path.isfile(ds):
            # Process a custom FASTA file with sequences
            seqs = [seq_io.read_genomes_from_fasta(ds)]
        else:
            dataset = importlib.import_module(
                'catch.datasets.' + ds)
            seqs = [seq_io.read_dataset_genomes(dataset)]
    except ImportError:
        raise ValueError("Unknown file or dataset '%s'" % ds)

    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement at "
                         "the same time"))
    elif args.limit_target_genomes:
        seqs = [genomes[:args.limit_target_genomes] for genomes in seqs]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        seqs = [random.choices(genomes, k=k) for genomes in seqs]

    # Set up the filters needed for replication
    filters = []
    # The filters we use are, in order:

    #  Duplicate filter (df) -- condense all candidate probes that
    #  are identical down to one; this is not necessary for
    #  correctness, as the naive redundant filter achieves the same
    #  task implicitly, but it does significantly lower runtime by
    #  decreasing the input size to the naive redundant filter
    df = duplicate_filter.DuplicateFilter()
    filters += [df]

    if args.naive_redundant_filter and args.dominating_set_filter:
        raise Exception(("Cannot use both 'naive_redundant_filter' and "
            "'dominating_set_filter' at the same time. (You could "
            "of course do one after the other, but it was probably "
            "a mistake to specify both.)"))
    elif args.naive_redundant_filter or args.dominating_set_filter:
        if args.naive_redundant_filter:
            # Naive redundant filter -- execute a greedy algorithm to
            # condense 'similar' probes down to one
            mismatches, lcf_thres = args.naive_redundant_filter
            filt_class = naive_redundant_filter.NaiveRedundantFilter
        if args.dominating_set_filter:
            # Dominating set filter (dsf) -- construct a graph where each
            # node is a probe and edges connect 'similar' probes; then
            # approximate the smallest dominating set
            mismatches, lcf_thres = args.dominating_set_filter
            filt_class = dominating_set_filter.DominatingSetFilter
        # Construct a function to determine whether two probes are
        # redundant, and then instantiate the appropriate filter
        redundant_fn = naive_redundant_filter.redundant_longest_common_substring(
                            mismatches, lcf_thres)
        filt = filt_class(redundant_fn)
        filters += [filt]

    if args.add_reverse_complements:
        # Reverse complement (rc) -- add the reverse complement of
        # each probe as a candidate
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # Design the probes
    pb = probe_designer.ProbeDesigner(seqs, filters,
                                      probe_length=args.probe_length,
                                      probe_stride=args.probe_stride)
    pb.design()

    if args.print_analysis:
        if args.naive_redundant_filter or args.dominating_set_filter:
            mismatch_thres = mismatches
        else:
            mismatch_thres = 0
        analyzer = coverage_analysis.Analyzer(pb.final_probes,
                                              mismatch_thres,
                                              args.probe_length,
                                              seqs,
                                              [args.dataset])
        analyzer.run()
        analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
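This main() expects an argparse-style namespace. A hypothetical driver, with flag names inferred from the attributes and error messages used above (the script's real interface may define more options or different defaults):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('dataset')  # a dataset name or a FASTA path
    parser.add_argument('--probe-length', type=int, default=100)
    parser.add_argument('--probe-stride', type=int, default=50)
    parser.add_argument('--limit-target-genomes', type=int)
    parser.add_argument('--limit-target-genomes-randomly-with-replacement',
                        type=int)
    # Each takes two values, unpacked above as (mismatches, lcf_thres)
    parser.add_argument('--naive-redundant-filter', nargs=2, type=int)
    parser.add_argument('--dominating-set-filter', nargs=2, type=int)
    parser.add_argument('--add-reverse-complements', action='store_true')
    parser.add_argument('--print-analysis', action='store_true')

    main(parser.parse_args())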
Example #7
def main(args):
    logger = logging.getLogger(__name__)

    # Set NCBI API key
    if args.ncbi_api_key:
        ncbi_neighbors.ncbi_api_key = args.ncbi_api_key

    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('collection:'):
            # Process a collection of datasets
            collection_name = ds[len('collection:'):]
            try:
                collection = importlib.import_module(
                    'catch.datasets.collections.' + collection_name)
            except ImportError:
                raise ValueError("Unknown dataset collection %s" %
                                 collection_name)
            for name, dataset in collection.import_all():
                genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
                genomes_grouped_names += [name]
        elif ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            if args.write_taxid_acc:
                taxid_fn = os.path.join(args.write_taxid_acc,
                                        str(taxid) + '.txt')
            else:
                taxid_fn = None
            if '-' in taxid:
                taxid, segment = taxid.split('-')
            else:
                segment = None
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(
                taxid, segment=segment, write_to=taxid_fn)
            genomes_grouped += [
                seq_io.read_genomes_from_fasta(ds_fasta_tf.name)
            ]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom FASTA file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown file or dataset '%s'" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if (args.limit_target_genomes
            and args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement at "
                         "the same time"))
    elif args.limit_target_genomes:
        genomes_grouped = [
            genomes[:args.limit_target_genomes] for genomes in genomes_grouped
        ]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        genomes_grouped = [
            random.choices(genomes, k=k) for genomes in genomes_grouped
        ]

    # Store the FASTA paths of blacklisted genomes
    blacklisted_genomes_fasta = []
    if args.blacklist_genomes:
        for bg in args.blacklist_genomes:
            if os.path.isfile(bg):
                # Process a custom FASTA file with sequences
                blacklisted_genomes_fasta += [bg]
            else:
                # Process an individual dataset
                try:
                    dataset = importlib.import_module('catch.datasets.' + bg)
                except ImportError:
                    raise ValueError("Unknown file or dataset '%s'" % bg)
                for fp in dataset.fasta_paths:
                    blacklisted_genomes_fasta += [fp]

    # Set up and verify parameters related to probe length
    if not args.lcf_thres:
        args.lcf_thres = args.probe_length
    if args.probe_stride > args.probe_length:
        logger.warning(("PROBE_STRIDE (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"), args.probe_stride,
                       args.probe_length)
    if args.lcf_thres > args.probe_length:
        logger.warning(("LCF_THRES (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"), args.lcf_thres,
                       args.probe_length)
    if args.island_of_exact_match > args.probe_length:
        logger.warning(("ISLAND_OF_EXACT_MATCH (%d) is greater than "
                        "PROBE_LENGTH (%d), which is usually undesirable "
                        "and may lead to undefined behavior"),
                       args.island_of_exact_match, args.probe_length)

    # Set up and verify parameters related to k-mer length in probe map
    if args.kmer_probe_map_k:
        # Check that k is sufficiently small
        if args.kmer_probe_map_k > args.probe_length:
            raise Exception(("KMER_PROBE_MAP_K (%d) exceeds PROBE_LENGTH "
                             "(%d), which is not permitted") %
                            (args.kmer_probe_map_k, args.probe_length))

        # Use this value for the SetCoverFilter, AdapterFilter, and
        # the Analyzer
        kmer_probe_map_k_scf = args.kmer_probe_map_k
        kmer_probe_map_k_af = args.kmer_probe_map_k
        kmer_probe_map_k_analyzer = args.kmer_probe_map_k
    else:
        if args.probe_length <= 20:
            logger.warning(("PROBE_LENGTH (%d) is small; you may want to "
                            "consider setting --kmer-probe-map-k to be "
                            "small as well in order to be more sensitive "
                            "in mapping candidate probes to target sequence"),
                           args.probe_length)

        # Use a default k of 20 for the SetCoverFilter and AdapterFilter,
        # and 10 for the Analyzer since we would like to be more sensitive
        # (potentially at the cost of slower runtime) for the latter
        kmer_probe_map_k_scf = 20
        kmer_probe_map_k_af = 20
        kmer_probe_map_k_analyzer = 10

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)
        cluster.set_max_num_processes_for_creating_distance_matrix(
            args.max_num_processes)

    # Raise exceptions or warn based on use of adapter arguments
    if args.add_adapters:
        if not (args.adapter_a or args.adapter_b):
            logger.warning(("Adapter sequences will be added, but default "
                            "sequences will be used; to provide adapter "
                            "sequences, use --adapter-a and --adapter-b"))
    else:
        if args.adapter_a or args.adapter_b:
            raise Exception(
                ("Adapter sequences were provided with "
                 "--adapter-a and --adapter-b, but --add-adapters is required "
                 "to add adapter sequences onto the ends of probes"))

    # Do not allow both --small-seq-skip and --small-seq-min, since they
    # have different intentions
    if args.small_seq_skip is not None and args.small_seq_min is not None:
        raise Exception(("Both --small-seq-skip and --small-seq-min were "
                         "specified, but both cannot be used together"))

    # Check arguments involving clustering
    if args.cluster_and_design_separately and args.identify:
        raise Exception(
            ("Cannot use --cluster-and-design-separately with "
             "--identify, because clustering collapses genome groupings into "
             "one"))
    if args.cluster_from_fragments and not args.cluster_and_design_separately:
        raise Exception(("Cannot use --cluster-from-fragments without also "
                         "setting --cluster-and-design-separately"))

    # Check for whether a custom hybridization function was provided
    if args.custom_hybridization_fn:
        custom_cover_range_fn = tuple(args.custom_hybridization_fn)
    else:
        custom_cover_range_fn = None
    if args.custom_hybridization_fn_tolerant:
        custom_cover_range_tolerant_fn = tuple(
            args.custom_hybridization_fn_tolerant)
    else:
        custom_cover_range_tolerant_fn = None

    # Set up the filters
    # The filters we use are, in order:
    filters = []

    # [Optional]
    # Fasta filter (ff) -- leave out candidate probes based on the
    #     given FASTA file
    if args.filter_from_fasta:
        ff = fasta_filter.FastaFilter(args.filter_from_fasta,
                                      skip_reverse_complements=True)
        filters += [ff]

    # [Optional]
    # Poly(A) filter (paf) -- leave out probes with stretches of 'A' or 'T'
    if args.filter_polya:
        polya_length, polya_mismatches = args.filter_polya
        if polya_length > args.probe_length:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "greater than PROBE_LENGTH (%d), which is usually "
                            "undesirable"), polya_length, args.probe_length)
        if polya_length < 10:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "short, and may lead to many probes being "
                            "filtered"), polya_length)
        if polya_mismatches > 10:
            logger.warning(("Number of mismatches to tolerate when searching "
                            "for poly(A) stretches (%d) is high, and may "
                            "lead to many probes being filtered"),
                           polya_mismatches)
        paf = polya_filter.PolyAFilter(polya_length, polya_mismatches)
        filters += [paf]

    # Duplicate filter (df) -- condense all candidate probes that
    #     are identical down to one; this is not necessary for
    #     correctness, as the set cover filter achieves the same task
    #     implicitly, but it does significantly lower runtime by
    #     decreasing the input size to the set cover filter
    # Near duplicate filter (ndf) -- condense candidate probes that
    #     are near-duplicates down to one using locality-sensitive
    #     hashing; like the duplicate filter, this is not necessary
    #     but can significantly lower runtime and reduce memory usage
    #     (even more than the duplicate filter)
    if (args.filter_with_lsh_hamming is not None
            and args.filter_with_lsh_minhash is not None):
        raise Exception(("Cannot use both --filter-with-lsh-hamming "
                         "and --filter-with-lsh-minhash"))
    if args.filter_with_lsh_hamming is not None:
        if args.filter_with_lsh_hamming > args.mismatches:
            logger.warning(
                ("Setting FILTER_WITH_LSH_HAMMING (%d) to be greater "
                 "than MISMATCHES (%d) may cause the probes to achieve less "
                 "than the desired coverage"), args.filter_with_lsh_hamming,
                args.mismatches)
        ndf = near_duplicate_filter.NearDuplicateFilterWithHammingDistance(
            args.filter_with_lsh_hamming, args.probe_length)
        filters += [ndf]
    elif args.filter_with_lsh_minhash is not None:
        ndf = near_duplicate_filter.NearDuplicateFilterWithMinHash(
            args.filter_with_lsh_minhash)
        filters += [ndf]
    else:
        df = duplicate_filter.DuplicateFilter()
        filters += [df]

    # Set cover filter (scf) -- solve the problem by treating it as
    #     an instance of the set cover problem
    scf = set_cover_filter.SetCoverFilter(
        mismatches=args.mismatches,
        lcf_thres=args.lcf_thres,
        island_of_exact_match=args.island_of_exact_match,
        mismatches_tolerant=args.mismatches_tolerant,
        lcf_thres_tolerant=args.lcf_thres_tolerant,
        island_of_exact_match_tolerant=args.island_of_exact_match_tolerant,
        custom_cover_range_fn=custom_cover_range_fn,
        custom_cover_range_tolerant_fn=custom_cover_range_tolerant_fn,
        identify=args.identify,
        blacklisted_genomes=blacklisted_genomes_fasta,
        coverage=args.coverage,
        cover_extension=args.cover_extension,
        cover_groupings_separately=args.cover_groupings_separately,
        kmer_probe_map_k=kmer_probe_map_k_scf,
        kmer_probe_map_use_native_dict=(
            args.use_native_dict_when_finding_tolerant_coverage))
    filters += [scf]

    # [Optional]
    # Adapter filter (af) -- add adapters to both the 5' and 3' ends
    #    of each probe
    if args.add_adapters:
        # Set default adapter sequences, if not provided
        if args.adapter_a:
            adapter_a = tuple(args.adapter_a)
        else:
            adapter_a = ('ATACGCCATGCTGGGTCTCC', 'CGTACTTGGGAGTCGGCCAT')
        if args.adapter_b:
            adapter_b = tuple(args.adapter_b)
        else:
            adapter_b = ('AGGCCCTGGCTGCTGATATG', 'GACCTTTTGGGACAGCGGTG')

        af = adapter_filter.AdapterFilter(
            adapter_a, adapter_b,
            mismatches=args.mismatches,
            lcf_thres=args.lcf_thres,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            kmer_probe_map_k=kmer_probe_map_k_af)
        filters += [af]

    # [Optional]
    # N expansion filter (nef) -- expand Ns in probe sequences
    # to avoid ambiguity
    if args.expand_n is not None:
        nef = n_expansion_filter.NExpansionFilter(
            limit_n_expansion_randomly=args.expand_n)
        filters += [nef]

    # [Optional]
    # Reverse complement (rc) -- add the reverse complement of each
    #    probe that remains
    if args.add_reverse_complements:
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # If requested, don't apply the set cover filter
    if args.skip_set_cover:
        filter_before_scf = filters[filters.index(scf) - 1]
        filters.remove(scf)

    # Define parameters for clustering sequences
    if args.cluster_and_design_separately:
        cluster_threshold = args.cluster_and_design_separately
        if args.skip_set_cover:
            cluster_merge_after = filter_before_scf
        else:
            cluster_merge_after = scf
        cluster_fragment_length = args.cluster_from_fragments
    else:
        cluster_threshold = None
        cluster_merge_after = None
        cluster_fragment_length = None

    # Design the probes
    pb = probe_designer.ProbeDesigner(
        genomes_grouped,
        filters,
        probe_length=args.probe_length,
        probe_stride=args.probe_stride,
        allow_small_seqs=args.small_seq_min,
        seq_length_to_skip=args.small_seq_skip,
        cluster_threshold=cluster_threshold,
        cluster_merge_after=cluster_merge_after,
        cluster_fragment_length=cluster_fragment_length)
    pb.design()

    # Write the final probes to the file args.output_probes
    seq_io.write_probe_fasta(pb.final_probes, args.output_probes)

    if (args.print_analysis or args.write_analysis_to_tsv
            or args.write_sliding_window_coverage
            or args.write_probe_map_counts_to_tsv):
        analyzer = coverage_analysis.Analyzer(
            pb.final_probes,
            args.mismatches,
            args.lcf_thres,
            genomes_grouped,
            genomes_grouped_names,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            cover_extension=args.cover_extension,
            kmer_probe_map_k=kmer_probe_map_k_analyzer,
            rc_too=args.add_reverse_complements)
        analyzer.run()
        if args.write_analysis_to_tsv:
            analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
        if args.write_sliding_window_coverage:
            analyzer.write_sliding_window_coverage(
                args.write_sliding_window_coverage)
        if args.write_probe_map_counts_to_tsv:
            analyzer.write_probe_map_counts(args.write_probe_map_counts_to_tsv)
        if args.print_analysis:
            analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
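Both main() functions hand ProbeDesigner an ordered list of filters, and the ordering matters: cheap filters like the duplicate filter shrink the input that expensive ones like the set cover filter must process. Conceptually, design() threads the candidate probes through the chain; a sketch of that idea, assuming each filter exposes the filter()/output_probes interface seen in Example #2 (not catch's exact internals):

    def run_filter_chain(candidate_probes, filters):
        # The output of each filter becomes the input of the next.
        probes = candidate_probes
        for f in filters:
            f.filter(probes)
            probes = f.output_probes
        return probes  # what ProbeDesigner exposes as final_probes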