def main(args):
    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(taxid)
            genomes_grouped += [
                seq_io.read_genomes_from_fasta(ds_fasta_tf.name)
            ]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom fasta file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown dataset %s" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if args.limit_target_genomes:
        genomes_grouped = [genomes[:args.limit_target_genomes]
                           for genomes in genomes_grouped]

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)

    # Read the FASTA file of probes
    fasta = seq_io.read_fasta(args.probes_fasta)
    probes = [probe.Probe.from_str(seq) for _, seq in fasta.items()]

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes,
        args.mismatches,
        args.lcf_thres,
        genomes_grouped,
        genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k)
    analyzer.run()
    if args.write_analysis_to_tsv:
        analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
    if args.write_sliding_window_coverage:
        analyzer.write_sliding_window_coverage(
            args.write_sliding_window_coverage)
    if args.print_analysis:
        analyzer.print_analysis()
def setUp(self):
    # Disable logging
    logging.disable(logging.INFO)

    # Create Analyzer instance with one target genome
    genome_a = genome.Genome.from_one_seq('ATCCATCCATNGGGTTTGAAGCG')
    probes_str = ['ATCCAT', 'TTTGAA', 'GAAGCG']
    probes = [probe.Probe.from_str(p) for p in probes_str]
    self.analyzer = ca.Analyzer(probes,
                                mismatches=0,
                                lcf_thres=6,
                                target_genomes=[[genome_a]],
                                target_genomes_names=["g_a"],
                                cover_extension=2,
                                kmer_probe_map_k=3,
                                rc_too=False)
    self.analyzer.run(window_length=6, window_stride=3)
def setUp(self):
    # Disable logging
    logging.disable(logging.INFO)

    # Create Analyzer instance with two target genomes
    genome_a = genome.Genome.from_one_seq('ATCCATCCATNGGGTTTGAAGCG')
    genome_b = genome.Genome.from_chrs(OrderedDict([('chr1', 'CCCCCC'),
                                                    ('chr2', 'NTGAAGCG')]))
    probes_str = ['ATCCAT', 'TTTGAA', 'GAAGCG', 'ATGGAT', 'AAACCC']
    probes = [probe.Probe.from_str(p) for p in probes_str]
    self.analyzer = ca.Analyzer(probes,
                                mismatches=0,
                                lcf_thres=6,
                                target_genomes=[[genome_a], [genome_b]],
                                target_genomes_names=["g_a", "g_b"],
                                kmer_probe_map_k=3)
    self.analyzer.run(window_length=6, window_stride=3)
def main(args):
    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        try:
            dataset = importlib.import_module('catch.datasets.' + ds)
        except ImportError:
            raise ValueError("Unknown dataset %s" % ds)
        genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
        genomes_grouped_names += [ds]

    if args.limit_target_genomes:
        genomes_grouped = [genomes[:args.limit_target_genomes]
                           for genomes in genomes_grouped]

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)

    # Read the FASTA file of probes
    fasta = seq_io.read_fasta(args.probes_fasta)
    probes = [probe.Probe.from_str(seq) for _, seq in fasta.items()]

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes,
        args.mismatches,
        args.lcf_thres,
        genomes_grouped,
        genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k)
    analyzer.run()
    if args.write_analysis_to_tsv:
        analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
    if args.write_sliding_window_coverage:
        analyzer.write_sliding_window_coverage(
            args.write_sliding_window_coverage)
    if args.print_analysis:
        analyzer.print_analysis()
def main(args):
    # Read the FASTA sequences
    ds = args.dataset
    try:
        if os.path.isfile(ds):
            # Process a custom fasta file with sequences
            seqs = [seq_io.read_genomes_from_fasta(ds)]
        else:
            dataset = importlib.import_module('catch.datasets.' + ds)
            seqs = [seq_io.read_dataset_genomes(dataset)]
    except ImportError:
        raise ValueError("Unknown file or dataset '%s'" % ds)

    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
            "--limit-target-genomes-randomly-with-replacement at "
            "the same time"))
    elif args.limit_target_genomes:
        seqs = [genomes[:args.limit_target_genomes] for genomes in seqs]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        seqs = [random.choices(genomes, k=k) for genomes in seqs]

    # Setup the filters needed for replication
    filters = []
    # The filters we use are, in order:

    # Duplicate filter (df) -- condense all candidate probes that
    # are identical down to one; this is not necessary for
    # correctness, as the naive redundant filter achieves the same
    # task implicitly, but it does significantly lower runtime by
    # decreasing the input size to the naive redundant filter
    df = duplicate_filter.DuplicateFilter()
    filters += [df]

    if args.naive_redundant_filter and args.dominating_set_filter:
        raise Exception(("Cannot use both 'naive_redundant_filter' and "
            "'dominating_set_filter' at the same time. (You could "
            "of course do one after the other, but it was probably "
            "a mistake to specify both.)"))
    elif args.naive_redundant_filter or args.dominating_set_filter:
        if args.naive_redundant_filter:
            # Naive redundant filter -- execute a greedy algorithm to
            # condense 'similar' probes down to one
            mismatches, lcf_thres = args.naive_redundant_filter
            filt_class = naive_redundant_filter.NaiveRedundantFilter
        if args.dominating_set_filter:
            # Dominating set filter (dsf) -- construct a graph where each
            # node is a probe and edges connect 'similar' probes; then
            # approximate the smallest dominating set
            mismatches, lcf_thres = args.dominating_set_filter
            filt_class = dominating_set_filter.DominatingSetFilter
        # Construct a function to determine whether two probes are
        # redundant, and then instantiate the appropriate filter
        redundant_fn = naive_redundant_filter.redundant_longest_common_substring(
            mismatches, lcf_thres)
        filt = filt_class(redundant_fn)
        filters += [filt]

    if args.add_reverse_complements:
        # Reverse complement (rc) -- add the reverse complement of
        # each probe as a candidate
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # Design the probes
    pb = probe_designer.ProbeDesigner(seqs, filters,
                                      probe_length=args.probe_length,
                                      probe_stride=args.probe_stride)
    pb.design()

    if args.print_analysis:
        if args.naive_redundant_filter or args.dominating_set_filter:
            mismatch_thres = mismatches
        else:
            mismatch_thres = 0
        analyzer = coverage_analysis.Analyzer(pb.final_probes,
                                              mismatch_thres,
                                              args.probe_length,
                                              seqs,
                                              [args.dataset])
        analyzer.run()
        analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
def main(args):
    logger = logging.getLogger(__name__)

    # Set NCBI API key
    if args.ncbi_api_key:
        ncbi_neighbors.ncbi_api_key = args.ncbi_api_key

    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('collection:'):
            # Process a collection of datasets
            collection_name = ds[len('collection:'):]
            try:
                collection = importlib.import_module(
                    'catch.datasets.collections.' + collection_name)
            except ImportError:
                raise ValueError("Unknown dataset collection %s" %
                                 collection_name)
            for name, dataset in collection.import_all():
                genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
                genomes_grouped_names += [name]
        elif ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            if args.write_taxid_acc:
                taxid_fn = os.path.join(args.write_taxid_acc,
                                        str(taxid) + '.txt')
            else:
                taxid_fn = None
            if '-' in taxid:
                taxid, segment = taxid.split('-')
            else:
                segment = None
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(
                taxid, segment=segment, write_to=taxid_fn)
            genomes_grouped += [
                seq_io.read_genomes_from_fasta(ds_fasta_tf.name)
            ]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom fasta file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown file or dataset '%s'" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
            "--limit-target-genomes-randomly-with-replacement at "
            "the same time"))
    elif args.limit_target_genomes:
        genomes_grouped = [genomes[:args.limit_target_genomes]
                           for genomes in genomes_grouped]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        genomes_grouped = [random.choices(genomes, k=k)
                           for genomes in genomes_grouped]

    # Store the FASTA paths of blacklisted genomes
    blacklisted_genomes_fasta = []
    if args.blacklist_genomes:
        for bg in args.blacklist_genomes:
            if os.path.isfile(bg):
                # Process a custom fasta file with sequences
                blacklisted_genomes_fasta += [bg]
            else:
                # Process an individual dataset
                try:
                    dataset = importlib.import_module('catch.datasets.' + bg)
                except ImportError:
                    raise ValueError("Unknown file or dataset '%s'" % bg)
                for fp in dataset.fasta_paths:
                    blacklisted_genomes_fasta += [fp]

    # Setup and verify parameters related to probe length
    if not args.lcf_thres:
        args.lcf_thres = args.probe_length
    if args.probe_stride > args.probe_length:
        logger.warning(("PROBE_STRIDE (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"),
                       args.probe_stride, args.probe_length)
    if args.lcf_thres > args.probe_length:
        logger.warning(("LCF_THRES (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"),
                       args.lcf_thres, args.probe_length)
    if args.island_of_exact_match > args.probe_length:
        logger.warning(("ISLAND_OF_EXACT_MATCH (%d) is greater than "
                        "PROBE_LENGTH (%d), which is usually undesirable "
                        "and may lead to undefined behavior"),
                       args.island_of_exact_match, args.probe_length)

    # Setup and verify parameters related to k-mer length in probe map
    if args.kmer_probe_map_k:
        # Check that k is sufficiently small
        if args.kmer_probe_map_k > args.probe_length:
            raise Exception(("KMER_PROBE_MAP_K (%d) exceeds PROBE_LENGTH "
                             "(%d), which is not permitted") %
                            (args.kmer_probe_map_k, args.probe_length))

        # Use this value for the SetCoverFilter, AdapterFilter, and
        # the Analyzer
        kmer_probe_map_k_scf = args.kmer_probe_map_k
        kmer_probe_map_k_af = args.kmer_probe_map_k
        kmer_probe_map_k_analyzer = args.kmer_probe_map_k
    else:
        if args.probe_length <= 20:
            logger.warning(("PROBE_LENGTH (%d) is small; you may want to "
                            "consider setting --kmer-probe-map-k to be "
                            "small as well in order to be more sensitive "
                            "in mapping candidate probes to target sequence"),
                           args.probe_length)

        # Use a default k of 20 for the SetCoverFilter and AdapterFilter,
        # and 10 for the Analyzer since we would like to be more sensitive
        # (potentially at the cost of slower runtime) for the latter
        kmer_probe_map_k_scf = 20
        kmer_probe_map_k_af = 20
        kmer_probe_map_k_analyzer = 10

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)
        cluster.set_max_num_processes_for_creating_distance_matrix(
            args.max_num_processes)

    # Raise exceptions or warn based on use of adapter arguments
    if args.add_adapters:
        if not (args.adapter_a or args.adapter_b):
            logger.warning(("Adapter sequences will be added, but default "
                            "sequences will be used; to provide adapter "
                            "sequences, use --adapter-a and --adapter-b"))
    else:
        if args.adapter_a or args.adapter_b:
            raise Exception(("Adapter sequences were provided with "
                "--adapter-a and --adapter-b, but --add-adapters is required "
                "to add adapter sequences onto the ends of probes"))

    # Do not allow both --small-seq-skip and --small-seq-min, since they
    # have different intentions
    if args.small_seq_skip is not None and args.small_seq_min is not None:
        raise Exception(("Both --small-seq-skip and --small-seq-min were "
            "specified, but both cannot be used together"))

    # Check arguments involving clustering
    if args.cluster_and_design_separately and args.identify:
        raise Exception(("Cannot use --cluster-and-design-separately with "
            "--identify, because clustering collapses genome groupings into "
            "one"))
    if args.cluster_from_fragments and not args.cluster_and_design_separately:
        raise Exception(("Cannot use --cluster-from-fragments without also "
            "setting --cluster-and-design-separately"))

    # Check for whether a custom hybridization function was provided
    if args.custom_hybridization_fn:
        custom_cover_range_fn = tuple(args.custom_hybridization_fn)
    else:
        custom_cover_range_fn = None
    if args.custom_hybridization_fn_tolerant:
        custom_cover_range_tolerant_fn = tuple(
            args.custom_hybridization_fn_tolerant)
    else:
        custom_cover_range_tolerant_fn = None

    # Setup the filters
    # The filters we use are, in order:
    filters = []

    # [Optional]
    # Fasta filter (ff) -- leave out candidate probes
    if args.filter_from_fasta:
        ff = fasta_filter.FastaFilter(args.filter_from_fasta,
                                      skip_reverse_complements=True)
        filters += [ff]

    # [Optional]
    # Poly(A) filter (paf) -- leave out probes with stretches of 'A' or 'T'
    if args.filter_polya:
        polya_length, polya_mismatches = args.filter_polya
        if polya_length > args.probe_length:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "greater than PROBE_LENGTH (%d), which is usually "
                            "undesirable"), polya_length, args.probe_length)
        if polya_length < 10:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "short, and may lead to many probes being "
                            "filtered"), polya_length)
        if polya_mismatches > 10:
            logger.warning(("Number of mismatches to tolerate when searching "
                            "for poly(A) stretches (%d) is high, and may "
                            "lead to many probes being filtered"),
                           polya_mismatches)
        paf = polya_filter.PolyAFilter(polya_length, polya_mismatches)
        filters += [paf]

    # Duplicate filter (df) -- condense all candidate probes that
    # are identical down to one; this is not necessary for
    # correctness, as the set cover filter achieves the same task
    # implicitly, but it does significantly lower runtime by
    # decreasing the input size to the set cover filter
    # Near duplicate filter (ndf) -- condense candidate probes that
    # are near-duplicates down to one using locality-sensitive
    # hashing; like the duplicate filter, this is not necessary
    # but can significantly lower runtime and reduce memory usage
    # (even more than the duplicate filter)
    if (args.filter_with_lsh_hamming is not None and
            args.filter_with_lsh_minhash is not None):
        raise Exception(("Cannot use both --filter-with-lsh-hamming "
            "and --filter-with-lsh-minhash"))
    if args.filter_with_lsh_hamming is not None:
        if args.filter_with_lsh_hamming > args.mismatches:
            logger.warning(
                ("Setting FILTER_WITH_LSH_HAMMING (%d) to be greater "
                 "than MISMATCHES (%d) may cause the probes to achieve less "
                 "than the desired coverage"),
                args.filter_with_lsh_hamming, args.mismatches)
        ndf = near_duplicate_filter.NearDuplicateFilterWithHammingDistance(
            args.filter_with_lsh_hamming, args.probe_length)
        filters += [ndf]
    elif args.filter_with_lsh_minhash is not None:
        ndf = near_duplicate_filter.NearDuplicateFilterWithMinHash(
            args.filter_with_lsh_minhash)
        filters += [ndf]
    else:
        df = duplicate_filter.DuplicateFilter()
        filters += [df]

    # Set cover filter (scf) -- solve the problem by treating it as
    # an instance of the set cover problem
    scf = set_cover_filter.SetCoverFilter(
        mismatches=args.mismatches,
        lcf_thres=args.lcf_thres,
        island_of_exact_match=args.island_of_exact_match,
        mismatches_tolerant=args.mismatches_tolerant,
        lcf_thres_tolerant=args.lcf_thres_tolerant,
        island_of_exact_match_tolerant=args.island_of_exact_match_tolerant,
        custom_cover_range_fn=custom_cover_range_fn,
        custom_cover_range_tolerant_fn=custom_cover_range_tolerant_fn,
        identify=args.identify,
        blacklisted_genomes=blacklisted_genomes_fasta,
        coverage=args.coverage,
        cover_extension=args.cover_extension,
        cover_groupings_separately=args.cover_groupings_separately,
        kmer_probe_map_k=kmer_probe_map_k_scf,
        kmer_probe_map_use_native_dict=\
            args.use_native_dict_when_finding_tolerant_coverage)
    filters += [scf]

    # [Optional]
    # Adapter filter (af) -- add adapters to both the 5' and 3' ends
    # of each probe
    if args.add_adapters:
        # Set default adapter sequences, if not provided
        if args.adapter_a:
            adapter_a = tuple(args.adapter_a)
        else:
            adapter_a = ('ATACGCCATGCTGGGTCTCC', 'CGTACTTGGGAGTCGGCCAT')
        if args.adapter_b:
            adapter_b = tuple(args.adapter_b)
        else:
            adapter_b = ('AGGCCCTGGCTGCTGATATG', 'GACCTTTTGGGACAGCGGTG')

        af = adapter_filter.AdapterFilter(adapter_a,
                                          adapter_b,
                                          mismatches=args.mismatches,
                                          lcf_thres=args.lcf_thres,
                                          island_of_exact_match=\
                                            args.island_of_exact_match,
                                          custom_cover_range_fn=\
                                            custom_cover_range_fn,
                                          kmer_probe_map_k=kmer_probe_map_k_af)
        filters += [af]

    # [Optional]
    # N expansion filter (nef) -- expand Ns in probe sequences
    # to avoid ambiguity
    if args.expand_n is not None:
        nef = n_expansion_filter.NExpansionFilter(
            limit_n_expansion_randomly=args.expand_n)
        filters += [nef]

    # [Optional]
    # Reverse complement (rc) -- add the reverse complement of each
    # probe that remains
    if args.add_reverse_complements:
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # If requested, don't apply the set cover filter
    if args.skip_set_cover:
        filter_before_scf = filters[filters.index(scf) - 1]
        filters.remove(scf)

    # Define parameters for clustering sequences
    if args.cluster_and_design_separately:
        cluster_threshold = args.cluster_and_design_separately
        if args.skip_set_cover:
            cluster_merge_after = filter_before_scf
        else:
            cluster_merge_after = scf
        cluster_fragment_length = args.cluster_from_fragments
    else:
        cluster_threshold = None
        cluster_merge_after = None
        cluster_fragment_length = None

    # Design the probes
    pb = probe_designer.ProbeDesigner(
        genomes_grouped, filters,
        probe_length=args.probe_length,
        probe_stride=args.probe_stride,
        allow_small_seqs=args.small_seq_min,
        seq_length_to_skip=args.small_seq_skip,
        cluster_threshold=cluster_threshold,
        cluster_merge_after=cluster_merge_after,
        cluster_fragment_length=cluster_fragment_length)
    pb.design()

    # Write the final probes to the file args.output_probes
    seq_io.write_probe_fasta(pb.final_probes, args.output_probes)

    if (args.print_analysis or args.write_analysis_to_tsv or
            args.write_sliding_window_coverage or
            args.write_probe_map_counts_to_tsv):
        analyzer = coverage_analysis.Analyzer(
            pb.final_probes,
            args.mismatches,
            args.lcf_thres,
            genomes_grouped,
            genomes_grouped_names,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            cover_extension=args.cover_extension,
            kmer_probe_map_k=kmer_probe_map_k_analyzer,
            rc_too=args.add_reverse_complements)
        analyzer.run()
        if args.write_analysis_to_tsv:
            analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
        if args.write_sliding_window_coverage:
            analyzer.write_sliding_window_coverage(
                args.write_sliding_window_coverage)
        if args.write_probe_map_counts_to_tsv:
            analyzer.write_probe_map_counts(args.write_probe_map_counts_to_tsv)
        if args.print_analysis:
            analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
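# A minimal usage sketch, not part of the scripts above: it strings together
# the same classes already used in the functions in this file on a small
# in-memory genome, assuming the same module aliases (genome, probe,
# duplicate_filter, probe_designer, coverage_analysis) are importable here.
# The function name, sequence, probe length, and thresholds are illustrative
# values only.
def _example_design_and_analyze():
    # One target genome group containing a single short sequence
    genome_a = genome.Genome.from_one_seq('ATCCATCCATNGGGTTTGAAGCG')
    seqs = [[genome_a]]

    # Condense identical candidate probes, then tile the genome with the
    # given probe length and stride to design probes
    filters = [duplicate_filter.DuplicateFilter()]
    pb = probe_designer.ProbeDesigner(seqs, filters,
                                      probe_length=6,
                                      probe_stride=3)
    pb.design()

    # Analyze how well the designed probes cover the target genome
    analyzer = coverage_analysis.Analyzer(pb.final_probes,
                                          mismatches=0,
                                          lcf_thres=6,
                                          target_genomes=seqs,
                                          target_genomes_names=["example"],
                                          kmer_probe_map_k=3)
    analyzer.run()
    analyzer.print_analysis()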