def test_run_best_match_not_in_cluster(self):
    '''Full run: the cluster has a match, but a sequence outside the cluster matches better'''
    out_fa = 'tmp.ref_seq_chooser_full_run_best_match_not_in_cluster.fa'
    all_refs, cluster_refs, contigs = (
        os.path.join(data_dir, name)
        for name in (
            'ref_seq_chooser_full_run_best_match_not_in_cluster.allrefs.fa',
            'ref_seq_chooser_full_run_best_match_not_in_cluster.clusterrefs.fa',
            'ref_seq_chooser_full_run_best_match_not_in_cluster.contigs.fa',
        )
    )

    chooser = ref_seq_chooser.RefSeqChooser(
        cluster_refs,
        all_refs,
        contigs,
        out_fa,
        sys.stdout,
    )
    chooser.run()

    # The closest reference is reported, but it is not a cluster member,
    # so no output FASTA is expected on disk.
    self.assertEqual('ref2', chooser.closest_ref_from_all_refs)
    self.assertFalse(chooser.closest_ref_is_in_cluster)
    self.assertFalse(os.path.exists(out_fa))
def test_run_no_nucmer_match(self):
    '''Full run: nearest match is in the cluster, but nucmer reports no matches'''
    out_fa = 'tmp.ref_seq_chooser_full_run_no_nucmer_match.fa'
    all_refs, cluster_refs, contigs = (
        os.path.join(data_dir, name)
        for name in (
            'ref_seq_chooser_full_run_no_nucmer_match.allrefs.fa',
            'ref_seq_chooser_full_run_no_nucmer_match.clusterrefs.fa',
            'ref_seq_chooser_full_run_no_nucmer_match.contigs.fa',
        )
    )

    chooser = ref_seq_chooser.RefSeqChooser(
        cluster_refs,
        all_refs,
        contigs,
        out_fa,
        sys.stdout,
    )
    chooser.run()

    # No closest reference is expected, hence no cluster membership
    # and no output FASTA on disk.
    self.assertEqual(None, chooser.closest_ref_from_all_refs)
    self.assertFalse(chooser.closest_ref_is_in_cluster)
    self.assertFalse(os.path.exists(out_fa))
def test_run_contained_ref_seq(self):
    '''Test full run where ref seq completely contains another seq outside cluster'''
    all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.all_refs.fa')
    cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.cluster_refs.fa')
    contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.contigs.fa')
    tmp_out = 'tmp.ref_seq_chooser_full_run_contained_ref_seq.fa'
    refchooser = ref_seq_chooser.RefSeqChooser(
        cluster_fasta,
        all_ref_fasta,
        contig_fasta,
        tmp_out,
        sys.stdout,
    )

    try:
        refchooser.run()
        self.assertEqual('ref2', refchooser.closest_ref_from_all_refs)
        self.assertTrue(refchooser.closest_ref_is_in_cluster)
        self.assertTrue(os.path.exists(tmp_out))
    finally:
        # Remove the output file even when run() or an assertion fails, so a
        # failing test does not leave a stale file in the working directory.
        if os.path.exists(tmp_out):
            os.unlink(tmp_out)
def test_run_best_match_is_in_cluster(self):
    '''Test full run where the best match is in the cluster'''
    all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_is_in_cluster.allrefs.fa')
    cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_is_in_cluster.clusterrefs.fa')
    contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_is_in_cluster.contigs.fa')
    tmp_out = 'tmp.ref_seq_chooser_full_run_best_match_is_in_cluster.fa'
    refchooser = ref_seq_chooser.RefSeqChooser(
        cluster_fasta,
        all_ref_fasta,
        contig_fasta,
        tmp_out,
        sys.stdout,
    )

    try:
        refchooser.run()
        self.assertEqual('ref1', refchooser.closest_ref_from_all_refs)
        self.assertTrue(refchooser.closest_ref_is_in_cluster)
        self.assertTrue(os.path.exists(tmp_out))
    finally:
        # Remove the output file even when run() or an assertion fails, so a
        # failing test does not leave a stale file in the working directory.
        if os.path.exists(tmp_out):
            os.unlink(tmp_out)
def test_run_flanking_different(self):
    '''Test full run where amount of flanking seq varies'''
    all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.all_refs.fa')
    cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.cluster_refs.fa')
    contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.contigs.fa')
    expected_fa = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.expected_contigs.fa')
    tmp_out = 'tmp.ref_seq_chooser_test_flanking.fa'
    refchooser = ref_seq_chooser.RefSeqChooser(
        cluster_fasta,
        all_ref_fasta,
        contig_fasta,
        tmp_out,
        sys.stdout,
    )

    try:
        refchooser.run()
        self.assertEqual('ref1', refchooser.closest_ref_from_all_refs)
        self.assertTrue(refchooser.closest_ref_is_in_cluster)
        # shallow=False forces a byte-by-byte comparison of the two files.
        self.assertTrue(filecmp.cmp(expected_fa, tmp_out, shallow=False))
    finally:
        # Remove the output file even when run() or an assertion fails, so a
        # failing test does not leave a stale file in the working directory.
        if os.path.exists(tmp_out):
            os.unlink(tmp_out)
def run(self):
    '''Run the assembler, pick the closest reference and map reads to the result.

    Steps, in order:
      1. Run the assembler selected by self.assembler ('fermilite' or
         'spades'). Any other value runs no assembler at all.
      2. Count the sequences in self.all_assembly_contigs_fa (a missing file
         counts as zero) and store the outcome in self.assembled_ok. Stop
         here if there are no contigs.
      3. Run ref_seq_chooser.RefSeqChooser, with self.best_assembly_fa as its
         output file. If it finds no closest reference, or the closest
         reference is not in the cluster, set self.ref_seq_name to None
         and stop.
      4. Write the chosen reference sequence to self.ref_fasta, fix contig
         orientation into self.final_assembly_fa, load those contigs into
         self.sequences, run bowtie2 and parse the resulting BAM.
      5. If self.clean is set, delete the BAM side files.

    Attributes written: sequences, assembled_ok, ref_seq_name,
    has_contigs_on_both_strands, scaff_graph_ok (the last three only when
    the corresponding step is reached) and log_fh.

    self.log_fh is always reset to None on exit, including when a step
    raises; the exception itself still propagates to the caller.
    '''
    try:
        if self.assembler == 'fermilite':
            self._assemble_with_fermilite()
        elif self.assembler == "spades":
            self._assemble_with_spades()
        print('Finished running assemblies', flush=True, file=self.log_fh)
        self.sequences = {}

        # double-check we got some contigs
        if os.path.exists(self.all_assembly_contigs_fa):
            number_of_contigs = pyfastaq.tasks.count_sequences(self.all_assembly_contigs_fa)
        else:
            number_of_contigs = 0

        self.assembled_ok = number_of_contigs != 0
        if not self.assembled_ok:
            return

        ref_chooser = ref_seq_chooser.RefSeqChooser(
            self.ref_fastas,
            self.all_reference_fasta,
            self.all_assembly_contigs_fa,
            self.best_assembly_fa,
            self.log_fh,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
        )
        ref_chooser.run()

        if ref_chooser.closest_ref_from_all_refs is None:
            print('Could not find match to reference sequences', file=self.log_fh)
            self.ref_seq_name = None
            return

        if not ref_chooser.closest_ref_is_in_cluster:
            print('Closest reference', ref_chooser.closest_ref_from_all_refs, 'was not in cluster', file=self.log_fh)
            self.ref_seq_name = None
            return

        self.ref_seq_name = ref_chooser.closest_ref_from_all_refs
        print('Closest reference sequence:', self.ref_seq_name, file=self.log_fh)

        # Copy the chosen reference out of the cluster references file.
        # NOTE(review): if no record id matches, self.ref_fasta is never
        # written here -- confirm that cannot happen for an in-cluster match.
        for ref_seq in pyfastaq.sequences.file_reader(self.ref_fastas):
            if self.ref_seq_name == ref_seq.id:
                f_out = pyfastaq.utils.open_file_write(self.ref_fasta)
                try:
                    print(ref_seq, file=f_out)
                finally:
                    # Close the handle even if the write fails.
                    pyfastaq.utils.close(f_out)
                break

        contigs_both_strands = self._fix_contig_orientation(
            self.best_assembly_fa,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen,
        )
        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            # Drops the last four characters of the BAM name
            # (presumably the '.bam' extension -- TODO confirm).
            self.final_assembly_bam[:-4],
            threads=self.threads,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh,
        )

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert,
        )
        print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

        if self.clean:
            # Side files named after the BAM (presumably produced by
            # _parse_bam -- TODO confirm).
            for suffix in ['soft_clipped', 'unmapped_mates', 'scaff']:
                filename = self.final_assembly_bam + '.' + suffix
                print('Deleting file', filename, file=self.log_fh)
                os.unlink(filename)
    finally:
        # This is to make this object picklable, to keep multithreading happy.
        # Done in a finally block so it also happens on early return or error.
        self.log_fh = None