def setUp(self):
    # Disable logging
    logging.disable(logging.INFO)

    assert len(ebola_zaire_with_2014.fasta_paths) == 1
    self.seqs_map = seq_io.read_fasta(ebola_zaire_with_2014.fasta_paths[0])
    self.seqs = list(self.seqs_map.values())
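# A companion tearDown is not shown above; a minimal sketch (an addition,
# not from the original) that undoes the process-wide logging.disable()
# call so later tests are unaffected:
def tearDown(self):
    # Re-enable logging
    logging.disable(logging.NOTSET)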
def _filter(self, input):
    """Return the subset of the input probes whose sequences appear in
    the FASTA file, ordered by their position in the file.
    """
    # Read the FASTA file
    fasta = seq_io.read_fasta(self.fasta_path)

    # Map each sequence from the file to its position in the file
    seqs_to_keep = {}
    for i, (header, seq) in enumerate(fasta.items()):
        if self.skip_reverse_complements:
            if "reverse complement" not in header:
                seqs_to_keep[seq] = i
        else:
            seqs_to_keep[seq] = i

    # Construct a list of tuples of the form
    # (probe's position in fasta file, probe)
    filtered = []
    for probe in input:
        if probe.seq_str in seqs_to_keep:
            order_in_file = seqs_to_keep[probe.seq_str]
            filtered += [(order_in_file, probe)]

    # Sort the filtered probes by their position in the file; sort on
    # the position alone so that probe objects are never compared
    filtered.sort(key=lambda x: x[0])

    # Strip the ordering information, keeping only the probes
    return [probe for _, probe in filtered]
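# A self-contained sketch (illustrative names and data only) of the
# decorate-sort-undecorate idiom _filter uses above: pair each kept item
# with its position in the file, sort by position, then drop the positions.
def _order_preserving_subset_demo():
    positions = {'ACGT': 0, 'GGCC': 1, 'TTAA': 2}  # seq -> position in file
    probes_in = ['TTAA', 'ACGT', 'NNNN']           # 'NNNN' is not in the file
    decorated = [(positions[p], p) for p in probes_in if p in positions]
    decorated.sort()
    return [p for _, p in decorated]               # -> ['ACGT', 'TTAA']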
def main(args):
    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        if ds.startswith('download:'):
            # Download a FASTA for an NCBI taxonomic ID
            taxid = ds[len('download:'):]
            ds_fasta_tf = ncbi_neighbors.construct_fasta_for_taxid(taxid)
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds_fasta_tf.name)]
            genomes_grouped_names += ['taxid:' + str(taxid)]
            ds_fasta_tf.close()
        elif os.path.isfile(ds):
            # Process a custom fasta file with sequences
            genomes_grouped += [seq_io.read_genomes_from_fasta(ds)]
            genomes_grouped_names += [os.path.basename(ds)]
        else:
            # Process an individual dataset
            try:
                dataset = importlib.import_module('catch.datasets.' + ds)
            except ImportError:
                raise ValueError("Unknown dataset %s" % ds)
            genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
            genomes_grouped_names += [ds]

    if args.limit_target_genomes:
        genomes_grouped = [genomes[:args.limit_target_genomes]
                           for genomes in genomes_grouped]

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)

    # Read the FASTA file of probes
    fasta = seq_io.read_fasta(args.probes_fasta)
    probes = [probe.Probe.from_str(seq) for _, seq in fasta.items()]

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes, args.mismatches, args.lcf_thres,
        genomes_grouped, genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k)
    analyzer.run()

    if args.write_analysis_to_tsv:
        analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
    if args.write_sliding_window_coverage:
        analyzer.write_sliding_window_coverage(
            args.write_sliding_window_coverage)
    if args.print_analysis:
        analyzer.print_analysis()
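# A sketch of an argparse setup that would supply every attribute main()
# reads above. This is NOT the project's actual parser: flag names are
# derived from the attribute names, and all types and defaults are
# assumptions for illustration only.
def _make_parser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', nargs='+',
                        help="dataset name, 'download:<taxid>', or FASTA path")
    parser.add_argument('--probes_fasta', required=True)
    parser.add_argument('--mismatches', type=int, default=0)
    parser.add_argument('--lcf_thres', type=int, default=100)
    parser.add_argument('--island_of_exact_match', type=int, default=0)
    parser.add_argument('--cover_extension', type=int, default=0)
    parser.add_argument('--kmer_probe_map_k', type=int, default=10)
    parser.add_argument('--limit_target_genomes', type=int)
    parser.add_argument('--max_num_processes', type=int)
    parser.add_argument('--write_analysis_to_tsv')
    parser.add_argument('--write_sliding_window_coverage')
    parser.add_argument('--print_analysis', action='store_true')
    return parser
    # Usage: main(_make_parser().parse_args())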
def test_construct_fasta_for_taxid(self):
    # Download Zika virus sequences
    fasta_tf = nn.construct_fasta_for_taxid(TAXIDS['ZIKV'])

    # Read the fasta
    seqs = seq_io.read_fasta(fasta_tf.name)

    # Check that there are at least 100 sequences (there should be
    # many more)
    self.assertGreaterEqual(len(seqs), 100)

    fasta_tf.close()
def test_single_chr_dataset(self):
    """Test reading the ebola_zaire_with_2014 dataset.

    Tests that the genomes obtained from reading the
    ebola_zaire_with_2014 dataset are the same as those obtained from
    directly reading the FASTA. This effectively executes most of the
    same code as seq_io.read_dataset_genomes(), but it does check that
    the function correctly enters the condition of reading just one
    sequence per genome.
    """
    genomes = seq_io.read_dataset_genomes(ebola_zaire_with_2014)

    assert len(ebola_zaire_with_2014.fasta_paths) == 1
    seqs = seq_io.read_fasta(ebola_zaire_with_2014.fasta_paths[0])
    desired_genomes = [genome.Genome.from_one_seq(s)
                       for s in seqs.values()]

    self.assertEqual(genomes, desired_genomes)
def main(args):
    # Read the genomes from FASTA sequences
    genomes_grouped = []
    genomes_grouped_names = []
    for ds in args.dataset:
        try:
            dataset = importlib.import_module('catch.datasets.' + ds)
        except ImportError:
            raise ValueError("Unknown dataset %s" % ds)
        genomes_grouped += [seq_io.read_dataset_genomes(dataset)]
        genomes_grouped_names += [ds]

    if args.limit_target_genomes:
        genomes_grouped = [genomes[:args.limit_target_genomes]
                           for genomes in genomes_grouped]

    # Set the maximum number of processes in multiprocessing pools
    if args.max_num_processes:
        probe.set_max_num_processes_for_probe_finding_pools(
            args.max_num_processes)

    # Read the FASTA file of probes
    fasta = seq_io.read_fasta(args.probes_fasta)
    probes = [probe.Probe.from_str(seq) for _, seq in fasta.items()]

    # Run the coverage analyzer
    analyzer = coverage_analysis.Analyzer(
        probes, args.mismatches, args.lcf_thres,
        genomes_grouped, genomes_grouped_names,
        island_of_exact_match=args.island_of_exact_match,
        cover_extension=args.cover_extension,
        kmer_probe_map_k=args.kmer_probe_map_k)
    analyzer.run()

    if args.write_analysis_to_tsv:
        analyzer.write_data_matrix_as_tsv(args.write_analysis_to_tsv)
    if args.write_sliding_window_coverage:
        analyzer.write_sliding_window_coverage(
            args.write_sliding_window_coverage)
    if args.print_analysis:
        analyzer.print_analysis()
def test_fetch_fastas(self):
    # Download Zika virus accessions
    fasta_tf = nn.fetch_fastas(ZIKV_ACCS)

    # Read the fasta
    seqs = seq_io.read_fasta(fasta_tf.name)

    # Verify that the right number of sequences were fetched
    self.assertEqual(len(seqs), len(ZIKV_ACCS))

    # Verify that each accession appears in exactly one sequence header.
    # It may not match exactly because the sequence header is
    # [accession].[version], but the accession should be a substring
    for acc in ZIKV_ACCS:
        num_with_acc = sum(1 for seq_header in seqs.keys()
                           if acc in seq_header)
        self.assertEqual(num_with_acc, 1)

    fasta_tf.close()
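# The test above notes that headers contain '[accession].[version]'. A
# stricter alternative to the substring check (a sketch; assumes the
# header's first whitespace-delimited token is that versioned accession)
# would compare versionless accessions exactly:
def _accession_of(header):
    # e.g., 'KU321639.1 Zika virus ...' -> 'KU321639'
    return header.split()[0].split('.')[0]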
def test_read(self):
    seqs = seq_io.read_fasta(self.fasta.name)
    self.assertEqual(seqs, self.expected)