Esempio n. 1
0
 def test_run_best_match_not_in_cluster(self):
     '''Full run: the cluster has a match, but a sequence outside the cluster matches better.

     Checks that 'ref2' is reported as the closest reference, that it is
     flagged as not being in the cluster, and that no output file exists
     after the run.
     '''
     outfile = 'tmp.ref_seq_chooser_full_run_best_match_not_in_cluster.fa'
     contigs = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_not_in_cluster.contigs.fa')
     refs_in_cluster = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_not_in_cluster.clusterrefs.fa')
     every_ref = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_not_in_cluster.allrefs.fa')

     chooser = ref_seq_chooser.RefSeqChooser(
         refs_in_cluster,
         every_ref,
         contigs,
         outfile,
         sys.stdout,
     )
     chooser.run()

     self.assertEqual('ref2', chooser.closest_ref_from_all_refs)
     self.assertFalse(chooser.closest_ref_is_in_cluster)
     output_was_written = os.path.exists(outfile)
     self.assertFalse(output_was_written)
Esempio n. 2
0
 def test_run_no_nucmer_match(self):
     '''Full run: a nearest match exists in the cluster, but there are no nucmer matches.

     Checks that no closest reference is reported (None), that the
     in-cluster flag is false, and that no output file exists after the run.
     '''
     outfile = 'tmp.ref_seq_chooser_full_run_no_nucmer_match.fa'
     contigs = os.path.join(data_dir, 'ref_seq_chooser_full_run_no_nucmer_match.contigs.fa')
     refs_in_cluster = os.path.join(data_dir, 'ref_seq_chooser_full_run_no_nucmer_match.clusterrefs.fa')
     every_ref = os.path.join(data_dir, 'ref_seq_chooser_full_run_no_nucmer_match.allrefs.fa')

     chooser = ref_seq_chooser.RefSeqChooser(
         refs_in_cluster,
         every_ref,
         contigs,
         outfile,
         sys.stdout,
     )
     chooser.run()

     self.assertEqual(None, chooser.closest_ref_from_all_refs)
     self.assertFalse(chooser.closest_ref_is_in_cluster)
     output_was_written = os.path.exists(outfile)
     self.assertFalse(output_was_written)
Esempio n. 3
0
 def test_run_contained_ref_seq(self):
     '''Test full run where ref seq completely contains another seq outside cluster.

     Checks that 'ref2' is reported as the closest reference, that it is
     flagged as being in the cluster, and that the output file is written.
     The output file is removed afterwards, whether or not the checks pass.
     '''
     all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.all_refs.fa')
     cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.cluster_refs.fa')
     contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_contained_ref_seq.contigs.fa')
     tmp_out = 'tmp.ref_seq_chooser_full_run_contained_ref_seq.fa'
     refchooser = ref_seq_chooser.RefSeqChooser(cluster_fasta, all_ref_fasta, contig_fasta, tmp_out, sys.stdout)
     try:
         refchooser.run()
         self.assertEqual('ref2', refchooser.closest_ref_from_all_refs)
         self.assertTrue(refchooser.closest_ref_is_in_cluster)
         self.assertTrue(os.path.exists(tmp_out))
     finally:
         # Clean up even when an assertion fails, so a failing run does not
         # leave the temporary file behind in the working directory.
         if os.path.exists(tmp_out):
             os.unlink(tmp_out)
Esempio n. 4
0
 def test_run_best_match_is_in_cluster(self):
     '''Test full run where the best match is in the cluster.

     Checks that 'ref1' is reported as the closest reference, that it is
     flagged as being in the cluster, and that the output file is written.
     The output file is removed afterwards, whether or not the checks pass.
     '''
     all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_is_in_cluster.allrefs.fa')
     cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_is_in_cluster.clusterrefs.fa')
     contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_full_run_best_match_is_in_cluster.contigs.fa')
     tmp_out = 'tmp.ref_seq_chooser_full_run_best_match_is_in_cluster.fa'
     refchooser = ref_seq_chooser.RefSeqChooser(cluster_fasta, all_ref_fasta, contig_fasta, tmp_out, sys.stdout)
     try:
         refchooser.run()
         self.assertEqual('ref1', refchooser.closest_ref_from_all_refs)
         self.assertTrue(refchooser.closest_ref_is_in_cluster)
         self.assertTrue(os.path.exists(tmp_out))
     finally:
         # Clean up even when an assertion fails, so a failing run does not
         # leave the temporary file behind in the working directory.
         if os.path.exists(tmp_out):
             os.unlink(tmp_out)
Esempio n. 5
0
 def test_run_flanking_different(self):
     '''Test full run where amount of flanking seq varies.

     Checks that 'ref1' is reported as the closest reference, that it is
     flagged as being in the cluster, and that the written output file is
     byte-for-byte identical to the expected contigs file. The output file
     is removed afterwards, whether or not the checks pass.
     '''
     all_ref_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.all_refs.fa')
     cluster_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.cluster_refs.fa')
     contig_fasta = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.contigs.fa')
     expected_fa = os.path.join(data_dir, 'ref_seq_chooser_test_flanking.expected_contigs.fa')
     tmp_out = 'tmp.ref_seq_chooser_test_flanking.fa'
     refchooser = ref_seq_chooser.RefSeqChooser(cluster_fasta, all_ref_fasta, contig_fasta, tmp_out, sys.stdout)
     try:
         refchooser.run()
         self.assertEqual('ref1', refchooser.closest_ref_from_all_refs)
         self.assertTrue(refchooser.closest_ref_is_in_cluster)
         # shallow=False forces a comparison of file contents, not just stat info.
         self.assertTrue(filecmp.cmp(expected_fa, tmp_out, shallow=False))
     finally:
         # Clean up even when an assertion fails, so a failing run does not
         # leave the temporary file behind in the working directory.
         if os.path.exists(tmp_out):
             os.unlink(tmp_out)
Esempio n. 6
0
    def run(self):
        '''Run the assembly, choose the closest reference and map the reads back.

        Steps, in order:
          1. Run the assembler selected by ``self.assembler`` ('fermilite'
             or 'spades'; any other value runs no assembler).
          2. Count the contigs in ``self.all_assembly_contigs_fa`` and set
             ``self.assembled_ok``. Stop here if there are none (or the
             file does not exist).
          3. Use ``ref_seq_chooser.RefSeqChooser`` to pick the closest
             reference. Stop with ``self.ref_seq_name = None`` if nothing
             matched or the closest reference is not in the cluster.
          4. Write the chosen reference sequence to ``self.ref_fasta``,
             fix contig orientation into ``self.final_assembly_fa`` and
             load those contigs into ``self.sequences``.
          5. Map the reads with bowtie2 and parse the BAM to set
             ``self.scaff_graph_ok``.
          6. If ``self.clean`` is set, delete the intermediate BAM-derived
             files.

        On every return path ``self.log_fh`` is set to None so that the
        object can be pickled afterwards. Returns None.
        '''
        if self.assembler == 'fermilite':
            self._assemble_with_fermilite()
        elif self.assembler == "spades":
            self._assemble_with_spades()
        print('Finished running assemblies', flush=True, file=self.log_fh)
        self.sequences = {}

        # double-check we got some contigs
        if os.path.exists(self.all_assembly_contigs_fa):
            number_of_contigs = pyfastaq.tasks.count_sequences(
                self.all_assembly_contigs_fa)
        else:
            number_of_contigs = 0

        self.assembled_ok = number_of_contigs != 0
        if not self.assembled_ok:
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return

        ref_chooser = ref_seq_chooser.RefSeqChooser(
            self.ref_fastas,
            self.all_reference_fasta,
            self.all_assembly_contigs_fa,
            self.best_assembly_fa,
            self.log_fh,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
        )
        ref_chooser.run()

        if ref_chooser.closest_ref_from_all_refs is None:
            print('Could not find match to reference sequences',
                  file=self.log_fh)
            self.ref_seq_name = None
            self.log_fh = None
            return

        if not ref_chooser.closest_ref_is_in_cluster:
            print('Closest reference',
                  ref_chooser.closest_ref_from_all_refs,
                  'was not in cluster',
                  file=self.log_fh)
            self.ref_seq_name = None
            self.log_fh = None
            return

        # Both failure cases returned above, so the closest reference is
        # known here and belongs to this cluster.
        self.ref_seq_name = ref_chooser.closest_ref_from_all_refs
        print('Closest reference sequence:',
              self.ref_seq_name,
              file=self.log_fh)

        # Copy just the chosen reference sequence into its own FASTA file.
        file_reader = pyfastaq.sequences.file_reader(self.ref_fastas)
        for ref_seq in file_reader:
            if self.ref_seq_name == ref_seq.id:
                f_out = pyfastaq.utils.open_file_write(self.ref_fasta)
                try:
                    print(ref_seq, file=f_out)
                finally:
                    # Close the handle even if the write fails.
                    pyfastaq.utils.close(f_out)
                break

        contigs_both_strands = self._fix_contig_orientation(
            self.best_assembly_fa,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen)
        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            # Output prefix: the BAM filename with its last four characters
            # removed (presumably the '.bam' extension -- confirm).
            self.final_assembly_bam[:-4],
            threads=self.threads,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh)

        self.scaff_graph_ok = self._parse_bam(self.sequences,
                                              self.final_assembly_bam,
                                              self.min_scaff_depth,
                                              self.max_insert)
        print('Scaffolding graph is OK:',
              self.scaff_graph_ok,
              file=self.log_fh)

        if self.clean:
            for suffix in ['soft_clipped', 'unmapped_mates', 'scaff']:
                filename = self.final_assembly_bam + '.' + suffix
                print('Deleting file', filename, file=self.log_fh)
                os.unlink(filename)

        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None