예제 #1
0
    def setUp(self):
        """Quiet logging and load the sequences of the Ebola Zaire dataset.

        The mapping returned by seq_io.read_fasta() is stored as
        self.seqs_map, and a list of its values as self.seqs.
        """
        # Suppress log messages of severity INFO and below
        logging.disable(logging.INFO)

        # The dataset must be backed by exactly one FASTA file
        fasta_paths = ebola_zaire_with_2014.fasta_paths
        assert len(fasta_paths) == 1

        seqs_by_header = seq_io.read_fasta(fasta_paths[0])
        self.seqs_map = seqs_by_header
        self.seqs = [*seqs_by_header.values()]
예제 #2
0
    def _filter(self, input):
        """Return the input probes whose sequences appear in the FASTA file.

        The FASTA file at self.fasta_path is read on every call. A probe
        is kept when its seq_str equals one of the sequences in that file;
        when self.skip_reverse_complements is set, records whose header
        contains "reverse complement" are ignored.

        Args:
            input: iterable of probes; each must have a seq_str attribute

        Returns:
            list of the kept probes, ordered by the position of their
            sequence in the FASTA file (probes sharing a sequence keep
            their relative input order)
        """
        # Read the FASTA file
        fasta = seq_io.read_fasta(self.fasta_path)

        # Map each sequence to keep onto its position in the file. The
        # position counts every record (including skipped ones), and a
        # sequence that occurs more than once keeps its last position.
        position_of_seq = {
            seq: i
            for i, (header, seq) in enumerate(fasta.items())
            if not (self.skip_reverse_complements
                    and "reverse complement" in header)
        }

        kept = [probe for probe in input
                if probe.seq_str in position_of_seq]

        # Sort on the file position alone. Sorting (position, probe)
        # tuples would fall back to comparing the probes themselves
        # whenever two of them share a sequence, which requires probes
        # to be orderable.
        kept.sort(key=lambda probe: position_of_seq[probe.seq_str])

        return kept
예제 #3
0
def main(args):
    """Analyze how well a set of probes covers groups of target genomes.

    Each entry of args.dataset is resolved in one of three ways:
      - 'download:<taxid>': a FASTA is constructed for the NCBI
        taxonomic ID via ncbi_neighbors and read as genomes
      - path to an existing file: read as a FASTA of genomes
      - anything else: imported as the module catch.datasets.<entry>

    The probes in args.probes_fasta are then passed, together with the
    genomes, to coverage_analysis.Analyzer. Results are written and/or
    printed according to args.write_analysis_to_tsv,
    args.write_sliding_window_coverage and args.print_analysis.

    Args:
        args: namespace-like object holding the parsed arguments

    Raises:
        ValueError: if a dataset entry is not a download request, not an
            existing file, and cannot be imported as a dataset module
    """
    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(taxid)
            try:
                genomes_grouped.append(
                    seq_io.read_genomes_from_fasta(ds_fasta_tf.name))
            finally:
                # Close the FASTA handle even if reading it raises, so
                # that it is not leaked on failure
                ds_fasta_tf.close()
            genomes_grouped_names.append('taxid:' + str(taxid))
        elif os.path.isfile(ds):
            # Process a custom fasta file with sequences
            genomes_grouped.append(seq_io.read_genomes_from_fasta(ds))
            genomes_grouped_names.append(os.path.basename(ds))
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown dataset %s" % ds)
            genomes_grouped.append(seq_io.read_dataset_genomes(dataset))
            genomes_grouped_names.append(ds)

    # Optionally keep only the first N genomes of every group
    if args.limit_target_genomes:
        genomes_grouped = [
            genomes[:args.limit_target_genomes] for genomes in genomes_grouped
        ]

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)

    # Read the FASTA file of probes; only the sequences are needed
    fasta = seq_io.read_fasta(args.probes_fasta)
    probes = [probe.Probe.from_str(seq) for seq in fasta.values()]

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes,
        args.mismatches,
        args.lcf_thres,
        genomes_grouped,
        genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k)
    analyzer.run()

    # Emit whichever outputs were requested
    if args.write_analysis_to_tsv:
        analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
    if args.write_sliding_window_coverage:
        analyzer.write_sliding_window_coverage(
            args.write_sliding_window_coverage)
    if args.print_analysis:
        analyzer.print_analysis()
예제 #4
0
    def test_construct_fasta_for_taxid(self):
        """Check that the FASTA built for the Zika virus taxid holds at
        least 100 sequences.
        """
        # Download Zika virus sequences
        fasta_tf = nn.construct_fasta_for_taxid(TAXIDS['ZIKV'])
        try:
            # Read the fasta
            seqs = seq_io.read_fasta(fasta_tf.name)

            # Check that there are at least 100 sequences (there should be
            # many more)
            self.assertGreaterEqual(len(seqs), 100)
        finally:
            # Close the handle even when reading or the assertion fails
            fasta_tf.close()
예제 #5
0
    def test_single_chr_dataset(self):
        """Compare seq_io.read_dataset_genomes() against a direct read.

        The genomes read from the ebola_zaire_with_2014 dataset must
        equal the genomes built, one per sequence, from the dataset's
        single FASTA file via genome.Genome.from_one_seq().

        This runs much of the same code as seq_io.read_dataset_genomes()
        itself, but it does check the case of one sequence per genome.
        """
        genomes = seq_io.read_dataset_genomes(ebola_zaire_with_2014)

        # The comparison below only makes sense for a one-FASTA dataset
        fasta_paths = ebola_zaire_with_2014.fasta_paths
        assert len(fasta_paths) == 1

        seqs_by_header = seq_io.read_fasta(fasta_paths[0])
        desired_genomes = []
        for s in seqs_by_header.values():
            desired_genomes.append(genome.Genome.from_one_seq(s))

        self.assertEqual(genomes, desired_genomes)
예제 #6
0
def main(args):
    """Analyze how well a set of probes covers groups of target genomes.

    Every entry of args.dataset is imported as the module
    catch.datasets.<entry> and its genomes are read as one group. The
    probes in args.probes_fasta are then passed, together with the
    genome groups, to coverage_analysis.Analyzer. Results are written
    and/or printed according to args.write_analysis_to_tsv,
    args.write_sliding_window_coverage and args.print_analysis.

    Args:
        args: namespace-like object holding the parsed arguments

    Raises:
        ValueError: if an entry of args.dataset cannot be imported as a
            dataset module
    """
    # Load one group of genomes per requested dataset
    genomes_grouped = []
    genomes_grouped_names = []
    for dataset_name in args.dataset:
        module_path = 'catch.datasets.' + dataset_name
        try:
            dataset_module = importlib.import_module(module_path)
        except ImportError:
            raise ValueError("Unknown dataset %s" % dataset_name)
        genomes_grouped.append(seq_io.read_dataset_genomes(dataset_module))
        genomes_grouped_names.append(dataset_name)

    # Optionally truncate every group to its first N genomes
    genome_limit = args.limit_target_genomes
    if genome_limit:
        genomes_grouped = [group[:genome_limit] for group in genomes_grouped]

    # Cap the size of the multiprocessing pools, if requested
    max_processes = args.max_num_processes
    if max_processes:
        probe.set_max_num_processes_for_probe_finding_pools(max_processes)

    # Build a probe from each sequence in the probes FASTA
    probe_seqs = seq_io.read_fasta(args.probes_fasta)
    probes = []
    for seq in probe_seqs.values():
        probes.append(probe.Probe.from_str(seq))

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes,
        args.mismatches,
        args.lcf_thres,
        genomes_grouped,
        genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k,
    )
    analyzer.run()

    # Emit whichever outputs were requested
    tsv_path = args.write_analysis_to_tsv
    if tsv_path:
        analyzer.write_data_matrix_as_tsv(tsv_path)

    sliding_window_path = args.write_sliding_window_coverage
    if sliding_window_path:
        analyzer.write_sliding_window_coverage(sliding_window_path)

    if args.print_analysis:
        analyzer.print_analysis()
예제 #7
0
    def test_fetch_fastas(self):
        """Check that fetching the Zika virus accessions yields exactly
        one sequence per accession.
        """
        # Download Zika virus accessions
        fasta_tf = nn.fetch_fastas(ZIKV_ACCS)
        try:
            # Read the fasta
            seqs = seq_io.read_fasta(fasta_tf.name)

            # Verify that the right number of sequences were fetched
            self.assertEqual(len(seqs), len(ZIKV_ACCS))

            # Verify that each accession appears in a sequence header (it
            # may not match exactly because the sequence header is
            # [accession].[version], but the accession should be a
            # substring)
            for acc in ZIKV_ACCS:
                num_with_acc = sum(1 for seq_header in seqs
                                   if acc in seq_header)
                self.assertEqual(num_with_acc, 1)
        finally:
            # Close the handle even when reading or an assertion fails
            fasta_tf.close()
예제 #8
0
 def test_read(self):
     """Check that reading self.fasta yields exactly self.expected."""
     self.assertEqual(seq_io.read_fasta(self.fasta.name), self.expected)