Example #1
    def test_basic(self):
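        """Check that only probes whose sequences appear in the FASTA are kept.

        Of the five candidates, only p4 ('ATCGATCG') and p2 ('GGGGG') match
        records in the FASTA, so they are the only probes in the output.
        """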
        fasta_file = tempfile.NamedTemporaryFile(mode='w')
        fasta_file.write(">probe1\n")
        fasta_file.write("ATCGATCG\n")
        fasta_file.write(">probe2\n")
        fasta_file.write("GGGGG\n")
        fasta_file.seek(0)

        p1 = probe.Probe.from_str('CCCCC')
        p2 = probe.Probe.from_str('GGGGG')
        p3 = probe.Probe.from_str('ATCGTTTT')
        p4 = probe.Probe.from_str('ATCGATCG')
        p5 = probe.Probe.from_str('NNNNATCG')
        input_probes = [p1, p2, p3, p4, p5]

        fasta_filter = ff.FastaFilter(fasta_file.name)
        fasta_filter.filter(input_probes)
        self.assertEqual(fasta_filter.output_probes, [p4, p2])

        fasta_file.close()
Example #2
    def test_skip_reverse_complements(self):
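        """Check that FASTA records marked as reverse complements are skipped.

        With skip_reverse_complements=True, the record whose header notes it
        is the reverse complement of probe1 is ignored, so p5 ('CGATCGAT')
        is filtered out even though its sequence appears in the FASTA.
        """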
        fasta_file = tempfile.NamedTemporaryFile(mode='w')
        fasta_file.write(">probe1\n")
        fasta_file.write("ATCGATCG\n")
        fasta_file.write(">probe2 | reverse complement of probe1\n")
        fasta_file.write("CGATCGAT\n")
        fasta_file.write(">probe3\n")
        fasta_file.write("GGGGG\n")
        fasta_file.seek(0)

        p1 = probe.Probe.from_str('CCCCC')
        p2 = probe.Probe.from_str('GGGGG')
        p3 = probe.Probe.from_str('ATCGTTTT')
        p4 = probe.Probe.from_str('ATCGATCG')
        p5 = probe.Probe.from_str('CGATCGAT')
        p6 = probe.Probe.from_str('NNNNATCG')
        input_probes = [p1, p2, p3, p4, p5, p6]

        fasta_filter = ff.FastaFilter(fasta_file.name,
                                      skip_reverse_complements=True)
        fasta_filter.filter(input_probes)
        self.assertEqual(fasta_filter.output_probes, [p4, p2])

        fasta_file.close()
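
Together, these tests suggest the following minimal standalone use of the filter. The import paths are assumptions based on what the tests reference (the aliases probe and ff presumably point at catch's probe module and its fasta filter module):

import tempfile

# Import paths are assumptions; the tests above only show the aliases
# probe and ff
from catch import probe
from catch.filter import fasta_filter as ff

# Write a one-record FASTA of allowed probe sequences to a temporary file
fasta_file = tempfile.NamedTemporaryFile(mode='w')
fasta_file.write(">keep\n")
fasta_file.write("ATCGATCG\n")
fasta_file.seek(0)  # flush buffered writes so the file is readable by name

# Only candidates whose sequences appear in the FASTA survive the filter
candidates = [probe.Probe.from_str('ATCGATCG'),
              probe.Probe.from_str('CCCCC')]
fasta_filter = ff.FastaFilter(fasta_file.name)
fasta_filter.filter(candidates)
print(fasta_filter.output_probes)  # expect only the ATCGATCG probe

fasta_file.close()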
Example #3
def main(args):
    logger = logging.getLogger(__name__)

    # Set NCBI API key
    if args.ncbi_api_key:
        ncbi_neighbors.ncbi_api_key = args.ncbi_api_key

    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('collection:'):
            # Process a collection of datasets
            collection_name = ds[len('collection:'):]
            try:
                collection = importlib.import_module(
                    'catch.datasets.collections.' + collection_name)
            except ImportError:
                raise ValueError("Unknown dataset collection %s" %
                                 collection_name)
            for name, dataset in collection.import_all():
                genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
                genomes_grouped_names += [name]
        elif ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            if args.write_taxid_acc:
                taxid_fn = os.path.join(args.write_taxid_acc,
                                        str(taxid) + '.txt')
            else:
                taxid_fn = None
            if '-' in taxid:
                taxid, segment = taxid.split('-')
            else:
                segment = None
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(
                taxid, segment=segment, write_to=taxid_fn)
            genomes_grouped += [
                seq_io.read_genomes_from_fasta(ds_fasta_tf.name)
            ]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom fasta file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown file or dataset '%s'" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if (args.limit_target_genomes
            and args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement at "
                         "the same time"))
    elif args.limit_target_genomes:
        genomes_grouped = [
            genomes[:args.limit_target_genomes] for genomes in genomes_grouped
        ]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        genomes_grouped = [
            random.choices(genomes, k=k) for genomes in genomes_grouped
        ]

    # Store the FASTA paths of blacklisted genomes
    blacklisted_genomes_fasta = []
    if args.blacklist_genomes:
        for bg in args.blacklist_genomes:
            if os.path.isfile(bg):
                # Process a custom fasta file with sequences
                blacklisted_genomes_fasta += [bg]
            else:
                # Process an individual dataset
                try:
                    dataset = importlib.import_module('catch.datasets.' + bg)
                except ImportError:
                    raise ValueError("Unknown file or dataset '%s'" % bg)
                for fp in dataset.fasta_paths:
                    blacklisted_genomes_fasta += [fp]

    # Set up and verify parameters related to probe length
    if not args.lcf_thres:
        args.lcf_thres = args.probe_length
    if args.probe_stride > args.probe_length:
        logger.warning(("PROBE_STRIDE (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"), args.probe_stride,
                       args.probe_length)
    if args.lcf_thres > args.probe_length:
        logger.warning(("LCF_THRES (%d) is greater than PROBE_LENGTH "
                        "(%d), which is usually undesirable and may lead "
                        "to undefined behavior"), args.lcf_thres,
                       args.probe_length)
    if args.island_of_exact_match > args.probe_length:
        logger.warning(("ISLAND_OF_EXACT_MATCH (%d) is greater than "
                        "PROBE_LENGTH (%d), which is usually undesirable "
                        "and may lead to undefined behavior"),
                       args.island_of_exact_match, args.probe_length)

    # Set up and verify parameters related to k-mer length in probe map
    if args.kmer_probe_map_k:
        # Check that k is sufficiently small
        if args.kmer_probe_map_k > args.probe_length:
            raise Exception(("KMER_PROBE_MAP_K (%d) exceeds PROBE_LENGTH "
                             "(%d), which is not permitted") %
                            (args.kmer_probe_map_k, args.probe_length))

        # Use this value for the SetCoverFilter, AdapterFilter, and
        # the Analyzer
        kmer_probe_map_k_scf = args.kmer_probe_map_k
        kmer_probe_map_k_af = args.kmer_probe_map_k
        kmer_probe_map_k_analyzer = args.kmer_probe_map_k
    else:
        if args.probe_length <= 20:
            logger.warning(("PROBE_LENGTH (%d) is small; you may want to "
                            "consider setting --kmer-probe-map-k to be "
                            "small as well in order to be more sensitive "
                            "in mapping candidate probes to target sequence"),
                           args.probe_length)

        # Use a default k of 20 for the SetCoverFilter and AdapterFilter,
        # and 10 for the Analyzer since we would like to be more sensitive
        # (potentially at the cost of slower runtime) for the latter
        kmer_probe_map_k_scf = 20
        kmer_probe_map_k_af = 20
        kmer_probe_map_k_analyzer = 10

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)
        cluster.set_max_num_processes_for_creating_distance_matrix(
            args.max_num_processes)

    # Raise exceptions or warn based on use of adapter arguments
    if args.add_adapters:
        if not (args.adapter_a or args.adapter_b):
            logger.warning(("Adapter sequences will be added, but default "
                            "sequences will be used; to provide adapter "
                            "sequences, use --adapter-a and --adapter-b"))
    else:
        if args.adapter_a or args.adapter_b:
            raise Exception(
                ("Adapter sequences were provided with "
                 "--adapter-a and --adapter-b, but --add-adapters is required "
                 "to add adapter sequences onto the ends of probes"))

    # Do not allow both --small-seq-skip and --small-seq-min, since they
    # have different intentions
    if args.small_seq_skip is not None and args.small_seq_min is not None:
        raise Exception(("Both --small-seq-skip and --small-seq-min were "
                         "specified, but both cannot be used together"))

    # Check arguments involving clustering
    if args.cluster_and_design_separately and args.identify:
        raise Exception(
            ("Cannot use --cluster-and-design-separately with "
             "--identify, because clustering collapses genome groupings into "
             "one"))
    if args.cluster_from_fragments and not args.cluster_and_design_separately:
        raise Exception(("Cannot use --cluster-from-fragments without also "
                         "setting --cluster-and-design-separately"))

    # Check for whether a custom hybridization function was provided
    if args.custom_hybridization_fn:
        custom_cover_range_fn = tuple(args.custom_hybridization_fn)
    else:
        custom_cover_range_fn = None
    if args.custom_hybridization_fn_tolerant:
        custom_cover_range_tolerant_fn = tuple(
            args.custom_hybridization_fn_tolerant)
    else:
        custom_cover_range_tolerant_fn = None

    # Set up the filters
    # The filters we use are, in order:
    filters = []

    # [Optional]
    # Fasta filter (ff) -- leave out candidate probes whose sequences do
    #     not appear in the given FASTA file
    if args.filter_from_fasta:
        ff = fasta_filter.FastaFilter(args.filter_from_fasta,
                                      skip_reverse_complements=True)
        filters += [ff]

    # [Optional]
    # Poly(A) filter (paf) -- leave out probes with stretches of 'A' or 'T'
    if args.filter_polya:
        polya_length, polya_mismatches = args.filter_polya
        if polya_length > args.probe_length:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "greater than PROBE_LENGTH (%d), which is usually "
                            "undesirable"), polya_length, args.probe_length)
        if polya_length < 10:
            logger.warning(("Length of poly(A) stretch to filter (%d) is "
                            "short, and may lead to many probes being "
                            "filtered"), polya_length)
        if polya_mismatches > 10:
            logger.warning(("Number of mismatches to tolerate when searching "
                            "for poly(A) stretches (%d) is high, and may "
                            "lead to many probes being filtered"),
                           polya_mismatches)
        paf = polya_filter.PolyAFilter(polya_length, polya_mismatches)
        filters += [paf]

    # Duplicate filter (df) -- condense all candidate probes that
    #     are identical down to one; this is not necessary for
    #     correctness, as the set cover filter achieves the same task
    #     implicitly, but it does significantly lower runtime by
    #     decreasing the input size to the set cover filter
    # Near duplicate filter (ndf) -- condense candidate probes that
    #     are near-duplicates down to one using locality-sensitive
    #     hashing; like the duplicate filter, this is not necessary
    #     but can significantly lower runtime and reduce memory usage
    #     (even more than the duplicate filter)
    if (args.filter_with_lsh_hamming is not None
            and args.filter_with_lsh_minhash is not None):
        raise Exception(("Cannot use both --filter-with-lsh-hamming "
                         "and --filter-with-lsh-minhash"))
    if args.filter_with_lsh_hamming is not None:
        if args.filter_with_lsh_hamming > args.mismatches:
            logger.warning(
                ("Setting FILTER_WITH_LSH_HAMMING (%d) to be greater "
                 "than MISMATCHES (%d) may cause the probes to achieve less "
                 "than the desired coverage"), args.filter_with_lsh_hamming,
                args.mismatches)
        ndf = near_duplicate_filter.NearDuplicateFilterWithHammingDistance(
            args.filter_with_lsh_hamming, args.probe_length)
        filters += [ndf]
    elif args.filter_with_lsh_minhash is not None:
        ndf = near_duplicate_filter.NearDuplicateFilterWithMinHash(
            args.filter_with_lsh_minhash)
        filters += [ndf]
    else:
        df = duplicate_filter.DuplicateFilter()
        filters += [df]

    # Set cover filter (scf) -- solve the problem by treating it as
    #     an instance of the set cover problem
    scf = set_cover_filter.SetCoverFilter(
        mismatches=args.mismatches,
        lcf_thres=args.lcf_thres,
        island_of_exact_match=args.island_of_exact_match,
        mismatches_tolerant=args.mismatches_tolerant,
        lcf_thres_tolerant=args.lcf_thres_tolerant,
        island_of_exact_match_tolerant=args.island_of_exact_match_tolerant,
        custom_cover_range_fn=custom_cover_range_fn,
        custom_cover_range_tolerant_fn=custom_cover_range_tolerant_fn,
        identify=args.identify,
        blacklisted_genomes=blacklisted_genomes_fasta,
        coverage=args.coverage,
        cover_extension=args.cover_extension,
        cover_groupings_separately=args.cover_groupings_separately,
        kmer_probe_map_k=kmer_probe_map_k_scf,
        kmer_probe_map_use_native_dict=(
            args.use_native_dict_when_finding_tolerant_coverage))
    filters += [scf]

    # [Optional]
    # Adapter filter (af) -- add adapters to both the 5' and 3' ends
    #    of each probe
    if args.add_adapters:
        # Set default adapter sequences, if not provided
        if args.adapter_a:
            adapter_a = tuple(args.adapter_a)
        else:
            adapter_a = ('ATACGCCATGCTGGGTCTCC', 'CGTACTTGGGAGTCGGCCAT')
        if args.adapter_b:
            adapter_b = tuple(args.adapter_b)
        else:
            adapter_b = ('AGGCCCTGGCTGCTGATATG', 'GACCTTTTGGGACAGCGGTG')

        af = adapter_filter.AdapterFilter(
            adapter_a, adapter_b,
            mismatches=args.mismatches,
            lcf_thres=args.lcf_thres,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            kmer_probe_map_k=kmer_probe_map_k_af)
        filters += [af]

    # [Optional]
    # N expansion filter (nef) -- expand Ns in probe sequences
    # to avoid ambiguity
    if args.expand_n is not None:
        nef = n_expansion_filter.NExpansionFilter(
            limit_n_expansion_randomly=args.expand_n)
        filters += [nef]

    # [Optional]
    # Reverse complement (rc) -- add the reverse complement of each
    #    probe that remains
    if args.add_reverse_complements:
        rc = reverse_complement_filter.ReverseComplementFilter()
        filters += [rc]

    # If requested, don't apply the set cover filter
    if args.skip_set_cover:
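        # Remember the filter immediately before scf so that, when
        # clustering and designing separately, results can be merged after
        # that filter instead (see cluster_merge_after below)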
        filter_before_scf = filters[filters.index(scf) - 1]
        filters.remove(scf)

    # Define parameters for clustering sequences
    if args.cluster_and_design_separately:
        cluster_threshold = args.cluster_and_design_separately
        if args.skip_set_cover:
            cluster_merge_after = filter_before_scf
        else:
            cluster_merge_after = scf
        cluster_fragment_length = args.cluster_from_fragments
    else:
        cluster_threshold = None
        cluster_merge_after = None
        cluster_fragment_length = None

    # Design the probes
    pb = probe_designer.ProbeDesigner(
        genomes_grouped,
        filters,
        probe_length=args.probe_length,
        probe_stride=args.probe_stride,
        allow_small_seqs=args.small_seq_min,
        seq_length_to_skip=args.small_seq_skip,
        cluster_threshold=cluster_threshold,
        cluster_merge_after=cluster_merge_after,
        cluster_fragment_length=cluster_fragment_length)
    pb.design()

    # Write the final probes to the file args.output_probes
    seq_io.write_probe_fasta(pb.final_probes, args.output_probes)

    if (args.print_analysis or args.write_analysis_to_tsv
            or args.write_sliding_window_coverage
            or args.write_probe_map_counts_to_tsv):
        analyzer = coverage_analysis.Analyzer(
            pb.final_probes,
            args.mismatches,
            args.lcf_thres,
            genomes_grouped,
            genomes_grouped_names,
            island_of_exact_match=args.island_of_exact_match,
            custom_cover_range_fn=custom_cover_range_fn,
            cover_extension=args.cover_extension,
            kmer_probe_map_k=kmer_probe_map_k_analyzer,
            rc_too=args.add_reverse_complements)
        analyzer.run()
        if args.write_analysis_to_tsv:
            analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
        if args.write_sliding_window_coverage:
            analyzer.write_sliding_window_coverage(
                args.write_sliding_window_coverage)
        if args.write_probe_map_counts_to_tsv:
            analyzer.write_probe_map_counts(args.write_probe_map_counts_to_tsv)
        if args.print_analysis:
            analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
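
For orientation, here is a minimal sketch of the core pipeline main() assembles when no optional flags are given: tile the target genomes into candidate probes, collapse exact duplicates, and select a final probe set by set cover. The module paths and the 'targets.fasta'/'probes.fasta' filenames are assumptions for illustration; the real entry point wires in many more options.

# Import paths are assumptions based on the module names used in main()
from catch.filter import duplicate_filter, probe_designer, set_cover_filter
from catch.utils import seq_io

# One grouping of target genomes, read from a FASTA (hypothetical path)
genomes_grouped = [seq_io.read_genomes_from_fasta('targets.fasta')]

# The default filters: collapse identical candidates, then treat probe
# selection as an instance of the set cover problem; lcf_thres defaults
# to the probe length, mirroring main() above
filters = [
    duplicate_filter.DuplicateFilter(),
    set_cover_filter.SetCoverFilter(mismatches=0, lcf_thres=100),
]

# Tile candidate probes across the genomes, run the filters in order,
# and write the surviving probes to a FASTA
pb = probe_designer.ProbeDesigner(genomes_grouped, filters,
                                  probe_length=100, probe_stride=50)
pb.design()
seq_io.write_probe_fasta(pb.final_probes, 'probes.fasta')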