def test_two_groupings(self):
    """Tests two groupings of input sequences in which the first grouping
    has two sequences and the second grouping has one sequence.

    Note that this test is dependent on the default values for generating
    candidate probes: probe length of 100 bp with a stride of 50 bp.
    """
    seqs = [[genome.Genome.from_one_seq('A' * 200),
             genome.Genome.from_one_seq('B' * 150)],
            [genome.Genome.from_one_seq('C' * 300)]]
    desired_candidate_probes = \
        ['A' * 100, 'A' * 100, 'A' * 100, 'B' * 100, 'B' * 100,
         'C' * 100, 'C' * 100, 'C' * 100, 'C' * 100, 'C' * 100]
    desired_candidate_probes = \
        [probe.Probe.from_str(s) for s in desired_candidate_probes]
    desired_final_probes = ['A' * 100, 'B' * 100, 'C' * 100]
    desired_final_probes = \
        [probe.Probe.from_str(s) for s in desired_final_probes]
    df = duplicate_filter.DuplicateFilter()
    pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=100,
                                      probe_stride=50)
    pb.design()
    self.assertEqual(pb.candidate_probes, desired_candidate_probes)
    self.assertEqual(pb.final_probes, desired_final_probes)
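# As a rough check on the counts expected above, here is a minimal sketch of
# sliding-window candidate generation with the default probe length (100 bp)
# and stride (50 bp). It is for illustration only and is not the
# ProbeDesigner implementation; the helper name `sketch_candidates` is
# hypothetical, and edge handling for sequences the window does not divide
# evenly is ignored here.
def sketch_candidates(seq, probe_length=100, probe_stride=50):
    # Slide a window of probe_length bases across seq, advancing by
    # probe_stride bases each step
    return [seq[i:i + probe_length]
            for i in range(0, len(seq) - probe_length + 1, probe_stride)]

# For example: a 200 bp sequence yields 3 candidates, a 150 bp sequence
# yields 2, and a 300 bp sequence yields 5, matching the counts asserted
# above.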
def test_basic(self):
    input = ['ATCGTCGCGG', 'ATCGTAGCGG', 'ATCGTCACGG', 'ATCGTAGCGG',
             'ATTGTCGCGG', 'ATCGTCGCGG']
    desired_output = ['ATCGTCGCGG', 'ATCGTAGCGG', 'ATCGTCACGG',
                      'ATTGTCGCGG']
    input_probes = [probe.Probe.from_str(s) for s in input]
    desired_output_probes = [probe.Probe.from_str(s)
                             for s in desired_output]
    f = duplicate_filter.DuplicateFilter()
    f.filter(input_probes)
    self.assertCountEqual(f.input_probes, input_probes)
    # Order should be preserved, so use assertEqual rather than
    # assertCountEqual
    self.assertEqual(f.output_probes, desired_output_probes)
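# The assertion above depends on the duplicate filter keeping the first
# occurrence of each sequence and preserving input order. A minimal sketch of
# that behavior (for illustration only, not the DuplicateFilter
# implementation; `dedup_preserving_order` is a hypothetical helper):
def dedup_preserving_order(seqs):
    seen = set()
    kept = []
    for s in seqs:
        if s not in seen:
            # First occurrence: keep it and remember it
            seen.add(s)
            kept.append(s)
    return kept

# Applied to the raw strings in `input` above, this yields exactly
# `desired_output`.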
def test_one_filter1(self):
    """A basic test with a duplicate filter and one input sequence.

    Note that this test is dependent on the default values for generating
    candidate probes: probe length of 100 bp with a stride of 50 bp.
    """
    seqs = [
        [genome.Genome.from_one_seq('A' * 100 + 'B' * 100 + 'A' * 100)]]
    desired_candidate_probes = \
        ['A' * 100, 'A' * 50 + 'B' * 50, 'B' * 100, 'B' * 50 + 'A' * 50,
         'A' * 100]
    desired_candidate_probes = \
        [probe.Probe.from_str(s) for s in desired_candidate_probes]
    desired_final_probes = ['A' * 100, 'A' * 50 + 'B' * 50, 'B' * 100,
                            'B' * 50 + 'A' * 50]
    desired_final_probes = \
        [probe.Probe.from_str(s) for s in desired_final_probes]
    df = duplicate_filter.DuplicateFilter()
    pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=100,
                                      probe_stride=50)
    pb.design()
    self.assertEqual(pb.candidate_probes, desired_candidate_probes)
    self.assertEqual(pb.final_probes, desired_final_probes)
def test_with_small_sequences(self):
    """A test with a duplicate filter and input sequences that are smaller
    than the probe length.
    """
    seqs = [[genome.Genome.from_one_seq('ABCDEFGHIJKLMN'),
             genome.Genome.from_one_seq('ABCDEFGHIXKLMN'),
             genome.Genome.from_one_seq('XYZAB')]]
    desired_candidate_probes = \
        ['ABCDEF', 'DEFGHI', 'GHIJKL', 'IJKLMN', 'ABCDEF', 'DEFGHI',
         'GHIXKL', 'IXKLMN', 'XYZAB']
    desired_candidate_probes = \
        [probe.Probe.from_str(s) for s in desired_candidate_probes]
    desired_final_probes = ['ABCDEF', 'DEFGHI', 'GHIJKL', 'IJKLMN',
                            'GHIXKL', 'IXKLMN', 'XYZAB']
    desired_final_probes = \
        [probe.Probe.from_str(s) for s in desired_final_probes]
    df = duplicate_filter.DuplicateFilter()
    pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=6,
                                      probe_stride=3, allow_small_seqs=5)
    pb.design()
    self.assertEqual(pb.candidate_probes, desired_candidate_probes)
    self.assertEqual(pb.final_probes, desired_final_probes)
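# A note on the expectations above: the last candidate from each 14 bp
# sequence ('IJKLMN' and 'IXKLMN') starts at position 8 rather than 9,
# suggesting the final window is anchored at the end of the sequence, and
# 'XYZAB' (5 bp) is shorter than the probe length but no shorter than
# allow_small_seqs (5), so it becomes a candidate as-is. A minimal sketch of
# that handling (hypothetical helper, not the ProbeDesigner implementation):
def sketch_candidates_allowing_small_seqs(seq, probe_length=6, probe_stride=3,
                                          allow_small_seqs=5):
    if len(seq) < probe_length:
        # Keep a small sequence whole if it meets the minimum allowed length
        return [seq] if len(seq) >= allow_small_seqs else []
    starts = list(range(0, len(seq) - probe_length + 1, probe_stride))
    if starts[-1] + probe_length < len(seq):
        # Anchor one final window at the end so the tail is covered
        starts.append(len(seq) - probe_length)
    return [seq[i:i + probe_length] for i in starts]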
def test_one_filter2(self):
    """A basic test with a duplicate filter and one input sequence.

    Note that this test uses a probe length of 75 bp and a stride of 25 bp.
    """
    seqs = [
        [genome.Genome.from_one_seq('A' * 100 + 'B' * 100 + 'A' * 100)]]
    desired_candidate_probes = \
        ['A' * 75, 'A' * 75, 'A' * 50 + 'B' * 25, 'A' * 25 + 'B' * 50,
         'B' * 75, 'B' * 75, 'B' * 50 + 'A' * 25, 'B' * 25 + 'A' * 50,
         'A' * 75, 'A' * 75]
    desired_candidate_probes = \
        [probe.Probe.from_str(s) for s in desired_candidate_probes]
    desired_final_probes = ['A' * 75, 'A' * 50 + 'B' * 25,
                            'A' * 25 + 'B' * 50, 'B' * 75,
                            'B' * 50 + 'A' * 25, 'B' * 25 + 'A' * 50]
    desired_final_probes = \
        [probe.Probe.from_str(s) for s in desired_final_probes]
    df = duplicate_filter.DuplicateFilter()
    pb = probe_designer.ProbeDesigner(seqs, [df], probe_length=75,
                                      probe_stride=25)
    pb.design()
    self.assertEqual(pb.candidate_probes, desired_candidate_probes)
    self.assertEqual(pb.final_probes, desired_final_probes)
def main(args):
    # Read the FASTA sequences
    ds = args.dataset
    try:
        if os.path.isfile(ds):
            # Process a custom fasta file with sequences
            seqs = [seq_io.read_genomes_from_fasta(ds)]
        else:
            dataset = importlib.import_module(
                'catch.datasets.' + ds)
            seqs = [seq_io.read_dataset_genomes(dataset)]
    except ImportError:
        raise ValueError("Unknown file or dataset '%s'" % ds)

    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement "
                         "at the same time"))
    elif args.limit_target_genomes:
        seqs = [genomes[:args.limit_target_genomes] for genomes in seqs]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        seqs = [random.choices(genomes, k=k) for genomes in seqs]

    # Setup the filters needed for replication
    filters = []
    # The filters we use are, in order:

    # Duplicate filter (df) -- condense all candidate probes that
    # are identical down to one; this is not necessary for
    # correctness, as the naive redundant filter achieves the same
    # task implicitly, but it does significantly lower runtime by
    # decreasing the input size to the naive redundant filter
    df = duplicate_filter.DuplicateFilter()
    filters += [df]

    if args.naive_redundant_filter and args.dominating_set_filter:
        raise Exception(("Cannot use both 'naive_redundant_filter' and "
                         "'dominating_set_filter' at the same time. (You "
                         "could of course do one after the other, but it was "
                         "probably a mistake to specify both.)"))
    elif args.naive_redundant_filter or args.dominating_set_filter:
        if args.naive_redundant_filter:
            # Naive redundant filter -- execute a greedy algorithm to
            # condense 'similar' probes down to one
            mismatches, lcf_thres = args.naive_redundant_filter
            filt_class = naive_redundant_filter.NaiveRedundantFilter
        if args.dominating_set_filter:
            # Dominating set filter (dsf) -- construct a graph where each
            # node is a probe and edges connect 'similar' probes; then
            # approximate the smallest dominating set
            mismatches, lcf_thres = args.dominating_set_filter
            filt_class = dominating_set_filter.DominatingSetFilter
        # Construct a function to determine whether two probes are
        # redundant, and then instantiate the appropriate filter
        redundant_fn = naive_redundant_filter.redundant_longest_common_substring(
            mismatches, lcf_thres)
        filt = filt_class(redundant_fn)
        filters += [filt]

    if args.add_reverse_complements:
        # Reverse complement (rc) -- add the reverse complement of
        # each probe as a candidate
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # Design the probes
    pb = probe_designer.ProbeDesigner(seqs, filters,
                                      probe_length=args.probe_length,
                                      probe_stride=args.probe_stride)
    pb.design()

    if args.print_analysis:
        if args.naive_redundant_filter or args.dominating_set_filter:
            mismatch_thres = mismatches
        else:
            mismatch_thres = 0
        analyzer = coverage_analysis.Analyzer(pb.final_probes,
                                              mismatch_thres,
                                              args.probe_length,
                                              seqs,
                                              [args.dataset])
        analyzer.run()
        analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
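# A minimal sketch of the greedy condensation idea described in the comments
# above: keep one probe and drop any later probe the redundancy predicate
# considers 'similar' to it. This is for illustration only and is not the
# NaiveRedundantFilter implementation; `greedy_condense` is a hypothetical
# helper, and `are_redundant` stands in for a predicate such as the one
# returned by redundant_longest_common_substring(mismatches, lcf_thres).
def greedy_condense(probes, are_redundant):
    kept = []
    remaining = list(probes)
    while remaining:
        # Keep the next remaining probe ...
        p = remaining.pop(0)
        kept.append(p)
        # ... and discard every remaining probe that is redundant with it
        remaining = [q for q in remaining if not are_redundant(p, q)]
    return kept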
def main(args):
    logger = logging.getLogger(__name__)

    # Set NCBI API key
    if args.ncbi_api_key:
        ncbi_neighbors.ncbi_api_key = args.ncbi_api_key

    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('collection:'):
            # Process a collection of datasets
            collection_name = ds[len('collection:'):]
            try:
                collection = importlib.import_module(
                    'catch.datasets.collections.' + collection_name)
            except ImportError:
                raise ValueError("Unknown dataset collection %s" %
                                 collection_name)
            for name, dataset in collection.import_all():
                genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
                genomes_grouped_names += [name]
        elif ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            if args.write_taxid_acc:
                taxid_fn = os.path.join(args.write_taxid_acc,
                                        str(taxid) + '.txt')
            else:
                taxid_fn = None
            if '-' in taxid:
                taxid, segment = taxid.split('-')
            else:
                segment = None
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(
                taxid, segment=segment, write_to=taxid_fn)
            genomes_grouped += [
                seq_io.read_genomes_from_fasta(ds_fasta_tf.name)
            ]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom fasta file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown file or dataset '%s'" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement "
                         "at the same time"))
    elif args.limit_target_genomes:
        genomes_grouped = [
            genomes[:args.limit_target_genomes]
            for genomes in genomes_grouped
        ]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        genomes_grouped = [
            random.choices(genomes, k=k) for genomes in genomes_grouped
        ]

    # Store the FASTA paths of blacklisted genomes
    blacklisted_genomes_fasta = []
    if args.blacklist_genomes:
        for bg in args.blacklist_genomes:
            if os.path.isfile(bg):
                # Process a custom fasta file with sequences
                blacklisted_genomes_fasta += [bg]
            else:
                # Process an individual dataset
                try:
                    dataset = importlib.import_module('catch.datasets.' + bg)
                except ImportError:
                    raise ValueError("Unknown file or dataset '%s'" % bg)
                for fp in dataset.fasta_paths:
                    blacklisted_genomes_fasta += [fp]

    # Setup and verify parameters related to probe length
    if not args.lcf_thres:
        args.lcf_thres = args.probe_length
    if args.probe_stride > args.probe_length:
        logger.warning(("PROBE_STRIDE (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"),
                       args.probe_stride, args.probe_length)
    if args.lcf_thres > args.probe_length:
        logger.warning(("LCF_THRES (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"),
                       args.lcf_thres, args.probe_length)
    if args.island_of_exact_match > args.probe_length:
        logger.warning(("ISLAND_OF_EXACT_MATCH (%d) is greater than "
                        "PROBE_LENGTH (%d), which is usually undesirable "
                        "and may lead to undefined behavior"),
                       args.island_of_exact_match, args.probe_length)

    # Setup and verify parameters related to k-mer length in probe map
    if args.kmer_probe_map_k:
        # Check that k is sufficiently small
        if args.kmer_probe_map_k > args.probe_length:
            raise Exception(("KMER_PROBE_MAP_K (%d) exceeds PROBE_LENGTH "
                             "(%d), which is not permitted") %
                            (args.kmer_probe_map_k, args.probe_length))

        # Use this value for the SetCoverFilter, AdapterFilter, and
        # the Analyzer
        kmer_probe_map_k_scf = args.kmer_probe_map_k
        kmer_probe_map_k_af = args.kmer_probe_map_k
        kmer_probe_map_k_analyzer = args.kmer_probe_map_k
    else:
        if args.probe_length <= 20:
            logger.warning(("PROBE_LENGTH (%d) is small; you may want to "
                            "consider setting --kmer-probe-map-k to be "
                            "small as well in order to be more sensitive "
                            "in mapping candidate probes to target sequence"),
                           args.probe_length)
        # Use a default k of 20 for the SetCoverFilter and AdapterFilter,
        # and 10 for the Analyzer since we would like to be more sensitive
        # (potentially at the cost of slower runtime) for the latter
        kmer_probe_map_k_scf = 20
        kmer_probe_map_k_af = 20
        kmer_probe_map_k_analyzer = 10

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)
        cluster.set_max_num_processes_for_creating_distance_matrix(
            args.max_num_processes)

    # Raise exceptions or warn based on use of adapter arguments
    if args.add_adapters:
        if not (args.adapter_a or args.adapter_b):
            logger.warning(("Adapter sequences will be added, but default "
                            "sequences will be used; to provide adapter "
                            "sequences, use --adapter-a and --adapter-b"))
    else:
        if args.adapter_a or args.adapter_b:
            raise Exception(
                ("Adapter sequences were provided with "
                 "--adapter-a and --adapter-b, but --add-adapters is "
                 "required to add adapter sequences onto the ends of probes"))

    # Do not allow both --small-seq-skip and --small-seq-min, since they
    # have different intentions
    if args.small_seq_skip is not None and args.small_seq_min is not None:
        raise Exception(("Both --small-seq-skip and --small-seq-min were "
                         "specified, but both cannot be used together"))

    # Check arguments involving clustering
    if args.cluster_and_design_separately and args.identify:
        raise Exception(
            ("Cannot use --cluster-and-design-separately with "
             "--identify, because clustering collapses genome groupings "
             "into one"))
    if args.cluster_from_fragments and not args.cluster_and_design_separately:
        raise Exception(("Cannot use --cluster-from-fragments without also "
                         "setting --cluster-and-design-separately"))

    # Check for whether a custom hybridization function was provided
    if args.custom_hybridization_fn:
        custom_cover_range_fn = tuple(args.custom_hybridization_fn)
    else:
        custom_cover_range_fn = None
    if args.custom_hybridization_fn_tolerant:
        custom_cover_range_tolerant_fn = tuple(
            args.custom_hybridization_fn_tolerant)
    else:
        custom_cover_range_tolerant_fn = None

    # Setup the filters
    # The filters we use are, in order:
    filters = []

    # [Optional]
    # Fasta filter (ff) -- leave out candidate probes
    if args.filter_from_fasta:
        ff = fasta_filter.FastaFilter(args.filter_from_fasta,
                                      skip_reverse_complements=True)
        filters += [ff]

    # [Optional]
    # Poly(A) filter (paf) -- leave out probes with stretches of 'A' or 'T'
    if args.filter_polya:
        polya_length, polya_mismatches = args.filter_polya
        if polya_length > args.probe_length:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "greater than PROBE_LENGTH (%d), which is "
                            "usually undesirable"),
                           polya_length, args.probe_length)
        if polya_length < 10:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "short, and may lead to many probes being "
                            "filtered"), polya_length)
        if polya_mismatches > 10:
            logger.warning(("Number of mismatches to tolerate when "
                            "searching for poly(A) stretches (%d) is high, "
                            "and may lead to many probes being filtered"),
                           polya_mismatches)
        paf = polya_filter.PolyAFilter(polya_length, polya_mismatches)
        filters += [paf]

    # Duplicate filter (df) -- condense all candidate probes that
    # are identical down to one; this is not necessary for
    # correctness, as the set cover filter achieves the same task
    # implicitly, but it does significantly lower runtime by
    # decreasing the input size to the set cover filter
    # Near duplicate filter (ndf) -- condense candidate probes that
    # are near-duplicates down to one using locality-sensitive
    # hashing; like the duplicate filter, this is not necessary
    # but can significantly lower runtime and reduce memory usage
    # (even more than the duplicate filter)
    if (args.filter_with_lsh_hamming is not None and
            args.filter_with_lsh_minhash is not None):
        raise Exception(("Cannot use both --filter-with-lsh-hamming "
                         "and --filter-with-lsh-minhash"))
    if args.filter_with_lsh_hamming is not None:
        if args.filter_with_lsh_hamming > args.mismatches:
            logger.warning(
                ("Setting FILTER_WITH_LSH_HAMMING (%d) to be greater "
                 "than MISMATCHES (%d) may cause the probes to achieve "
                 "less than the desired coverage"),
                args.filter_with_lsh_hamming, args.mismatches)
        ndf = near_duplicate_filter.NearDuplicateFilterWithHammingDistance(
            args.filter_with_lsh_hamming, args.probe_length)
        filters += [ndf]
    elif args.filter_with_lsh_minhash is not None:
        ndf = near_duplicate_filter.NearDuplicateFilterWithMinHash(
            args.filter_with_lsh_minhash)
        filters += [ndf]
    else:
        df = duplicate_filter.DuplicateFilter()
        filters += [df]

    # Set cover filter (scf) -- solve the problem by treating it as
    # an instance of the set cover problem
    scf = set_cover_filter.SetCoverFilter(
        mismatches=args.mismatches,
        lcf_thres=args.lcf_thres,
        island_of_exact_match=args.island_of_exact_match,
        mismatches_tolerant=args.mismatches_tolerant,
        lcf_thres_tolerant=args.lcf_thres_tolerant,
        island_of_exact_match_tolerant=args.island_of_exact_match_tolerant,
        custom_cover_range_fn=custom_cover_range_fn,
        custom_cover_range_tolerant_fn=custom_cover_range_tolerant_fn,
        identify=args.identify,
        blacklisted_genomes=blacklisted_genomes_fasta,
        coverage=args.coverage,
        cover_extension=args.cover_extension,
        cover_groupings_separately=args.cover_groupings_separately,
        kmer_probe_map_k=kmer_probe_map_k_scf,
        kmer_probe_map_use_native_dict=args.use_native_dict_when_finding_tolerant_coverage)
    filters += [scf]

    # [Optional]
    # Adapter filter (af) -- add adapters to both the 5' and 3' ends
    # of each probe
    if args.add_adapters:
        # Set default adapter sequences, if not provided
        if args.adapter_a:
            adapter_a = tuple(args.adapter_a)
        else:
            adapter_a = ('ATACGCCATGCTGGGTCTCC', 'CGTACTTGGGAGTCGGCCAT')
        if args.adapter_b:
            adapter_b = tuple(args.adapter_b)
        else:
            adapter_b = ('AGGCCCTGGCTGCTGATATG', 'GACCTTTTGGGACAGCGGTG')
        af = adapter_filter.AdapterFilter(
            adapter_a,
            adapter_b,
            mismatches=args.mismatches,
            lcf_thres=args.lcf_thres,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            kmer_probe_map_k=kmer_probe_map_k_af)
        filters += [af]

    # [Optional]
    # N expansion filter (nef) -- expand Ns in probe sequences
    # to avoid ambiguity
    if args.expand_n is not None:
        nef = n_expansion_filter.NExpansionFilter(
            limit_n_expansion_randomly=args.expand_n)
        filters += [nef]

    # [Optional]
    # Reverse complement (rc) -- add the reverse complement of each
    # probe that remains
    if args.add_reverse_complements:
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # If requested, don't apply the set cover filter
    if args.skip_set_cover:
        filter_before_scf = filters[filters.index(scf) - 1]
        filters.remove(scf)

    # Define parameters for clustering sequences
    if args.cluster_and_design_separately:
        cluster_threshold = args.cluster_and_design_separately
        if args.skip_set_cover:
            cluster_merge_after = filter_before_scf
        else:
            cluster_merge_after = scf
        cluster_fragment_length = args.cluster_from_fragments
    else:
        cluster_threshold = None
        cluster_merge_after = None
        cluster_fragment_length = None

    # Design the probes
    pb = probe_designer.ProbeDesigner(
        genomes_grouped, filters,
        probe_length=args.probe_length,
        probe_stride=args.probe_stride,
        allow_small_seqs=args.small_seq_min,
        seq_length_to_skip=args.small_seq_skip,
        cluster_threshold=cluster_threshold,
        cluster_merge_after=cluster_merge_after,
        cluster_fragment_length=cluster_fragment_length)
    pb.design()

    # Write the final probes to the file args.output_probes
    seq_io.write_probe_fasta(pb.final_probes, args.output_probes)

    if (args.print_analysis or args.write_analysis_to_tsv or
            args.write_sliding_window_coverage or
            args.write_probe_map_counts_to_tsv):
        analyzer = coverage_analysis.Analyzer(
            pb.final_probes,
            args.mismatches,
            args.lcf_thres,
            genomes_grouped,
            genomes_grouped_names,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            cover_extension=args.cover_extension,
            kmer_probe_map_k=kmer_probe_map_k_analyzer,
            rc_too=args.add_reverse_complements)
        analyzer.run()
        if args.write_analysis_to_tsv:
            analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
        if args.write_sliding_window_coverage:
            analyzer.write_sliding_window_coverage(
                args.write_sliding_window_coverage)
        if args.write_probe_map_counts_to_tsv:
            analyzer.write_probe_map_counts(
                args.write_probe_map_counts_to_tsv)
        if args.print_analysis:
            analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
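# The set cover filter above is described as treating probe selection as an
# instance of the set cover problem. For illustration only, here is a minimal
# sketch of the standard greedy approximation for set cover; it is not the
# SetCoverFilter implementation (which also accounts for mismatches, partial
# coverage, cover extension, blacklisted genomes, etc.), and the helper name
# and arguments are hypothetical.
def greedy_set_cover(universe, covered_by_probe):
    # universe: the set of elements (e.g., target positions) to cover
    # covered_by_probe: dict mapping each candidate probe to the subset of
    # the universe it covers
    uncovered = set(universe)
    chosen = []
    while uncovered:
        # Pick the probe that covers the most still-uncovered elements
        best = max(covered_by_probe,
                   key=lambda p: len(covered_by_probe[p] & uncovered))
        newly_covered = covered_by_probe[best] & uncovered
        if not newly_covered:
            # No probe covers anything that remains; stop
            break
        chosen.append(best)
        uncovered -= newly_covered
    return chosen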