Example #1
0
 def test_run_bowtie2_remove_both_unmapped(self):
     '''Test run_bowtie2 unsorted remove both unmapped

     Maps the fixture read pair to the fixture reference with
     remove_both_unmapped=True and compares the columns of the BAM that is
     written with those of the stored expected BAM.
     '''
     self.maxDiff = None  # show the complete diff if the lists differ
     ref_fa, fwd_fq, rev_fq, expected_bam = [
         os.path.join(data_dir, basename)
         for basename in (
             'mapping_test_bowtie2_ref.fa',
             'mapping_test_bowtie2_remove_both_unmapped_reads_1.fq',
             'mapping_test_bowtie2_remove_both_unmapped_reads_2.fq',
             'mapping_test_bowtie2_remove_both_unmapped_reads.bam',
         )
     ]
     out_prefix = 'tmp.out.bowtie2_remove_both_unmapped'
     got_bam = out_prefix + '.bam'

     mapping.run_bowtie2(
         fwd_fq, rev_fq, ref_fa, out_prefix,
         bowtie2=extern_progs.exe('bowtie2'),
         bowtie2_version=extern_progs.version('bowtie2'),
         remove_both_unmapped=True,
     )

     self.assertListEqual(get_sam_columns(expected_bam), get_sam_columns(got_bam))
     # tidy up the BAM written by run_bowtie2
     os.unlink(got_bam)
Example #2
0
    def _total_alignment_score(self, seq_name):
        '''Return the total alignment score of the reads mapped to one reference sequence.

        The sequence called seq_name is extracted from self.references_fa into
        a scratch directory created inside the current working directory,
        self.reads1/self.reads2 are mapped to it with bowtie2, and the score
        is read from the resulting BAM by mapping.get_total_alignment_score.
        Progress is logged to self.log_fh.

        The scratch directory is removed before this method exits, whether
        it returns normally or one of the steps raises.
        '''
        tmpdir = tempfile.mkdtemp(prefix='tmp.get_total_aln_score.', dir=os.getcwd())
        try:
            tmp_bam = os.path.join(tmpdir, 'tmp.get_total_alignment_score.bam')
            tmp_fa = os.path.join(tmpdir, 'tmp.get_total_alignment_score.ref.fa')

            faidx.write_fa_subset(
                [seq_name],
                self.references_fa,
                tmp_fa,
                samtools_exe=self.samtools_exe,
                verbose=True,
                verbose_filehandle=self.log_fh
            )

            mapping.run_bowtie2(
                self.reads1,
                self.reads2,
                tmp_fa,
                tmp_bam[:-4],  # output prefix: the BAM name without '.bam'
                threads=self.threads,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=True,
                verbose_filehandle=self.log_fh
            )

            return mapping.get_total_alignment_score(tmp_bam)
        finally:
            # runs on failure too, so a failed extraction/mapping does not
            # leave the scratch directory behind in the working directory
            shutil.rmtree(tmpdir)
Example #3
0
    def _get_total_alignment_score(self, gene_name):
        '''Return the total alignment score of the reads mapped to one gene.

        gene_name is extracted from self.genes_fa into a scratch FASTA in
        self.root_dir, self.reads1/self.reads2 are mapped to it with bowtie2,
        and the score is read from the resulting BAM by
        mapping.get_total_alignment_score.

        The scratch BAM, the scratch FASTA and the accompanying .fai file are
        deleted before this method exits, including when a step raises.
        Without that, leftovers from one failed call would trip the
        existence checks below on every later call.
        '''
        tmp_bam = os.path.join(self.root_dir,
                               'tmp.get_total_alignment_score.bam')
        # refuse to overwrite files that are already present
        assert not os.path.exists(tmp_bam)
        tmp_fa = os.path.join(self.root_dir,
                              'tmp.get_total_alignment_score.ref.fa')
        assert not os.path.exists(tmp_fa)

        try:
            faidx.write_fa_subset([gene_name],
                                  self.genes_fa,
                                  tmp_fa,
                                  samtools_exe=self.samtools_exe,
                                  verbose=self.verbose)
            mapping.run_bowtie2(
                self.reads1,
                self.reads2,
                tmp_fa,
                tmp_bam[:-4],  # output prefix: the BAM name without '.bam'
                threads=self.threads,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=self.verbose,
            )

            return mapping.get_total_alignment_score(tmp_bam)
        finally:
            # Only remove what actually got created, so that a failure part
            # way through is not masked by a FileNotFoundError from here.
            for leftover in (tmp_bam, tmp_fa, tmp_fa + '.fai'):
                if os.path.exists(leftover):
                    os.unlink(leftover)
Example #4
0
    def _assemble_with_velvet(self):
        '''Assemble the reads with velvet (velveth followed by velvetg).

        The reads are first mapped to self.gene_fa with bowtie2, then
        velveth and velvetg are run from inside self.assembly_dir.

        On success self.assembled_ok is true and a symlink named after
        self.assembly_contigs is made, pointing at velvet's contigs.fa.
        If either program fails, its error output is written to
        'velveth_errors' or 'velvetg_errors' in self.assembly_dir and
        'assembly_fail' is added to self.status_flag.

        The working directory is restored on every exit path, including
        when an exception is raised after the chdir.
        '''
        # map reads to reference gene to make BAM input to velvet columbus
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.gene_fa,
            self.gene_bam[:-4],
            threads=self.threads,
            sort=True,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=self.verbose,
        )

        # NOTE(review): presumably run_bowtie2(sort=True) leaves the
        # unsorted BAM at <prefix>.unsorted.bam -- confirm in mapping.
        velveth_cmd = ' '.join([
            self.velveth, self.assembler_dir,
            str(self.assembly_kmer), '-reference', self.gene_fa,
            '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
        ])

        cwd = os.getcwd()
        os.chdir(self.assembly_dir)
        try:
            velvet_contigs = os.path.join(
                os.path.split(self.assembler_dir)[1], 'contigs.fa')

            self.velveth_ok, err = common.syscall(velveth_cmd,
                                                  verbose=self.verbose,
                                                  allow_fail=True)
            if not self.velveth_ok:
                with open('velveth_errors', 'w') as f:
                    print(err, file=f)
                self.status_flag.add('assembly_fail')
                # NOTE(review): self.assembled_ok is not assigned on this
                # path -- confirm it is initialised elsewhere.
                return

            velvetg_cmd = ' '.join([
                self.velvetg,
                self.assembler_dir,
                '-ins_length',
                str(int(self.reads_insert)),
                '-scaffolding no',
                '-exp_cov auto',
                '-very_clean yes',
                '-cov_cutoff auto',
            ])

            self.assembled_ok, err = common.syscall(velvetg_cmd,
                                                    verbose=self.verbose,
                                                    allow_fail=True)
            if self.assembled_ok:
                os.symlink(velvet_contigs,
                           os.path.basename(self.assembly_contigs))
            else:
                with open('velvetg_errors', 'w') as f:
                    print(err, file=f)
                self.status_flag.add('assembly_fail')
        finally:
            # always go back, even if a syscall, open() or symlink raised
            os.chdir(cwd)
Example #5
0
    def _assemble_with_velvet(self):
        '''Assemble the reads with velvet (velveth followed by velvetg).

        The reads are first mapped to self.gene_fa with bowtie2, then
        velveth and velvetg are run from inside self.assembly_dir.

        On success self.assembled_ok is true and a symlink named after
        self.assembly_contigs is made, pointing at velvet's contigs.fa.
        If either program fails, its error output is written to
        'velveth_errors' or 'velvetg_errors' in self.assembly_dir and
        'assembly_fail' is added to self.status_flag.

        The working directory is restored on every exit path, including
        when an exception is raised after the chdir.
        '''
        # map reads to reference gene to make BAM input to velvet columbus
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.gene_fa,
            self.gene_bam[:-4],
            threads=self.threads,
            sort=True,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=self.verbose,
        )

        # NOTE(review): presumably run_bowtie2(sort=True) leaves the
        # unsorted BAM at <prefix>.unsorted.bam -- confirm in mapping.
        velveth_cmd = ' '.join([
            self.velveth,
            self.assembler_dir,
            str(self.assembly_kmer),
            '-reference', self.gene_fa,
            '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
        ])

        cwd = os.getcwd()
        os.chdir(self.assembly_dir)
        try:
            velvet_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'contigs.fa')

            self.velveth_ok, err = common.syscall(velveth_cmd, verbose=self.verbose, allow_fail=True)
            if not self.velveth_ok:
                with open('velveth_errors', 'w') as f:
                    print(err, file=f)
                self.status_flag.add('assembly_fail')
                # NOTE(review): self.assembled_ok is not assigned on this
                # path -- confirm it is initialised elsewhere.
                return

            velvetg_cmd = ' '.join([
                self.velvetg,
                self.assembler_dir,
                '-ins_length', str(int(self.reads_insert)),
                '-scaffolding no',
                '-exp_cov auto',
                '-very_clean yes',
                '-cov_cutoff auto',
            ])

            self.assembled_ok, err = common.syscall(velvetg_cmd, verbose=self.verbose, allow_fail=True)
            if self.assembled_ok:
                os.symlink(velvet_contigs, os.path.basename(self.assembly_contigs))
            else:
                with open('velvetg_errors', 'w') as f:
                    print(err, file=f)
                self.status_flag.add('assembly_fail')
        finally:
            # always go back, even if a syscall, open() or symlink raised
            os.chdir(cwd)
Example #6
0
    def run(self):
        '''Assemble, scaffold, gap-fill and orient the contigs, then map the reads back.

        Sets self.assembled_ok to False and returns early when the assembly
        contigs file is missing or empty, or when nothing survives the
        minimum scaffold length filter.  Otherwise fills self.sequences from
        the final assembly and sets self.has_contigs_on_both_strands and
        self.scaff_graph_ok.

        self.log_fh is set to None on every exit path so that the object
        can be pickled.
        '''
        self._assemble_with_spades()
        self.sequences = {}

        # double-check we got some contigs
        if os.path.exists(self.assembly_contigs):
            contig_count = pyfastaq.tasks.count_sequences(self.assembly_contigs)
        else:
            contig_count = 0

        if contig_count == 0:
            self.assembled_ok = False
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return

        self.assembled_ok = True
        self._scaffold_with_sspace()
        self._gap_fill_with_gapfiller()

        pyfastaq.tasks.filter(
            self.gapfilled_scaffolds,
            self.gapfilled_length_filtered,
            minlength=self.min_scaff_length,
        )
        if pyfastaq.tasks.count_sequences(self.gapfilled_length_filtered) == 0:
            self.assembled_ok = False
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return

        contigs_both_strands = self._fix_contig_orientation(
            self.gapfilled_length_filtered,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen,
        )
        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            samtools=self.extern_progs.exe('samtools'),
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset=self.bowtie2_preset,
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert,
        )
        print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

        if self.clean:
            # remove the side files that sit next to the final BAM
            for extension in ('soft_clipped', 'unmapped_mates', 'scaff'):
                doomed = self.final_assembly_bam + '.' + extension
                print('Deleting file', doomed, file=self.log_fh)
                os.unlink(doomed)

        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None
Example #7
0
    def run(self):
        '''Choose the best gene, assemble the reads against it and analyse the assembly.

        When no gene can be chosen, or the assembly produces no contigs,
        self.assembled_ok ends up False and only the report and clean-up
        steps run.  Otherwise the assembly is finished, the reads are mapped
        back to it, and it is compared with the gene using nucmer.
        '''
        self.gene = self._choose_best_gene()
        if self.gene is None:
            self.assembled_ok = False
        elif self.assembler == 'velvet':
            self._assemble_with_velvet()
        elif self.assembler == 'spades':
            self._assemble_with_spades()

        # velvet can finish successfully, but make an empty contigs file
        if self.assembled_ok and pyfastaq.tasks.count_sequences(self.assembly_contigs) == 0:
            self.assembled_ok = False
            self.status_flag.add('assembly_fail')

        if self.assembled_ok:
            # finish the assembly
            self._scaffold_with_sspace()
            self._gap_fill_with_gapfiller()
            self._fix_contig_orientation()
            self._load_final_contigs()

            # map reads to assembly
            mapping.run_bowtie2(
                self.reads1, self.reads2,
                self.final_assembly_fa,
                self.final_assembly_bam[:-4],
                threads=self.threads,
                sort=True,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=self.verbose,
            )
            self._parse_assembly_bam()

            # compare gene and assembly
            self._run_nucmer(
                self.final_assembly_fa,
                self.assembly_vs_gene_coords,
                show_snps=True,
            )
            # the remaining steps take no arguments and run strictly in this order
            for step in (
                self._parse_assembly_vs_gene_coords,
                self._nucmer_hits_to_percent_identity,
                self._get_mummer_variants,
                self._filter_mummer_variants,
                self._update_flag_from_nucmer_file,
                self._make_assembly_vcf,
                self._get_vcf_variant_counts,
            ):
                step()

        self._make_report_lines()
        self._clean()
Example #8
0
 def test_run_bowtie2(self):
     """Test run_bowtie2 unsorted

     Runs run_bowtie2 with its default options on the fixture reads and
     reference, then compares the columns of the BAM that is written with
     those of the stored unsorted BAM.
     """
     def fixture(basename):
         return os.path.join(data_dir, basename)

     self.maxDiff = None  # show the complete diff if the lists differ
     out_prefix = "tmp.out.bowtie2"
     got_bam = out_prefix + ".bam"

     mapping.run_bowtie2(
         fixture("mapping_test_bowtie2_reads_1.fq"),
         fixture("mapping_test_bowtie2_reads_2.fq"),
         fixture("mapping_test_bowtie2_ref.fa"),
         out_prefix,
     )

     expected_columns = get_sam_columns(fixture("mapping_test_bowtie2_unsorted.bam"))
     self.assertListEqual(expected_columns, get_sam_columns(got_bam))
     os.unlink(got_bam)
Example #9
0
 def _map_reads_to_clustered_genes(self):
     '''Map the read pair to the clustered gene FASTA with bowtie2.

     self.bam_prefix is passed as the output prefix; threads, executables,
     preset and verbosity all come from this object's settings.
     '''
     bowtie2_settings = {
         'threads': self.threads,
         'samtools': self.samtools_exe,
         'bowtie2': self.bowtie2_exe,
         'bowtie2_preset': self.bowtie2_preset,
         'verbose': self.verbose,
     }
     mapping.run_bowtie2(
         self.reads_1, self.reads_2, self.db_fasta_clustered, self.bam_prefix,
         **bowtie2_settings
     )
Example #10
0
 def _map_reads_to_clustered_genes(self):
     '''Map the read pair to the clustered gene FASTA with bowtie2.

     self.bam_prefix is passed as the output prefix; threads, executables,
     preset and verbosity all come from this object's settings.
     '''
     bowtie2_settings = {
         'threads': self.threads,
         'samtools': self.samtools_exe,
         'bowtie2': self.bowtie2_exe,
         'bowtie2_preset': self.bowtie2_preset,
         'verbose': self.verbose,
     }
     mapping.run_bowtie2(
         self.reads_1, self.reads_2, self.db_fasta_clustered, self.bam_prefix,
         **bowtie2_settings
     )
Example #11
0
    def run(self):
        '''Choose the best gene, assemble the reads against it and analyse the assembly.

        When no gene can be chosen, or the assembly produces no contigs,
        self.assembled_ok ends up False and only the report and clean-up
        steps run.  Otherwise the assembly is finished, the reads are mapped
        back to it, it is compared with the gene using nucmer, and the
        assembled gene sequences are derived from the nucmer hits.
        '''
        self.gene = self._choose_best_gene()
        if self.gene is None:
            self.assembled_ok = False
        elif self.assembler == 'velvet':
            self._assemble_with_velvet()
        elif self.assembler == 'spades':
            self._assemble_with_spades()

        # velvet can finish successfully, but make an empty contigs file
        if self.assembled_ok and pyfastaq.tasks.count_sequences(self.assembly_contigs) == 0:
            self.assembled_ok = False
            self.status_flag.add('assembly_fail')

        if self.assembled_ok:
            # finish the assembly
            self._scaffold_with_sspace()
            self._gap_fill_with_gapfiller()
            self._fix_contig_orientation()
            self._load_final_contigs()

            # map reads to assembly
            mapping.run_bowtie2(
                self.reads1, self.reads2,
                self.final_assembly_fa,
                self.final_assembly_bam[:-4],
                threads=self.threads,
                sort=True,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=self.verbose,
            )
            self._parse_assembly_bam()

            # compare gene and assembly
            self._run_nucmer(
                self.final_assembly_fa,
                self.assembly_vs_gene_coords,
                show_snps=True,
            )
            # the next steps take no arguments and run strictly in this order
            for step in (
                self._parse_assembly_vs_gene_coords,
                self._nucmer_hits_to_percent_identity,
                self._get_mummer_variants,
                self._filter_mummer_variants,
                self._update_flag_from_nucmer_file,
                self._make_assembly_vcf,
                self._get_vcf_variant_counts,
            ):
                step()
            self._nucmer_hits_to_assembled_gene_sequences(
                self.nucmer_hits,
                self.gene,
                self.final_assembly,
                self.final_assembled_genes_fa,
            )

        self._make_report_lines()
        self._clean()
Example #12
0
 def test_run_bowtie2(self):
     '''Test run_bowtie2 unsorted

     Runs run_bowtie2 with its default options on the fixture reads and
     reference, then compares the columns of the BAM that is written with
     those of the stored unsorted BAM.
     '''
     def fixture(basename):
         return os.path.join(data_dir, basename)

     self.maxDiff = None  # show the complete diff if the lists differ
     out_prefix = 'tmp.out.bowtie2'
     got_bam = out_prefix + '.bam'

     mapping.run_bowtie2(
         fixture('mapping_test_bowtie2_reads_1.fq'),
         fixture('mapping_test_bowtie2_reads_2.fq'),
         fixture('mapping_test_bowtie2_ref.fa'),
         out_prefix,
     )

     expected_columns = get_sam_columns(fixture('mapping_test_bowtie2_unsorted.bam'))
     self.assertListEqual(expected_columns, get_sam_columns(got_bam))
     os.unlink(got_bam)
Example #13
0
 def _map_reads_to_clustered_genes(self):
     '''Map the read pair to the cd-hit cluster representatives with bowtie2.

     self.bam_prefix is passed as the output prefix and
     remove_both_unmapped is switched on.  The samtools and bowtie2
     executables are looked up through self.extern_progs.
     '''
     samtools_exe = self.extern_progs.exe('samtools')
     bowtie2_exe = self.extern_progs.exe('bowtie2')
     mapping.run_bowtie2(
         self.reads_1,
         self.reads_2,
         self.cdhit_cluster_representatives_fa,
         self.bam_prefix,
         threads=self.threads,
         samtools=samtools_exe,
         bowtie2=bowtie2_exe,
         bowtie2_preset=self.bowtie2_preset,
         verbose=self.verbose,
         remove_both_unmapped=True,
     )
Example #14
0
 def test_run_bowtie2(self):
     '''Test run_bowtie2 unsorted

     Runs run_bowtie2 with the samtools and bowtie2 executables reported by
     extern_progs, then compares the columns of the BAM that is written
     with those of the stored unsorted BAM.
     '''
     self.maxDiff = None  # show the complete diff if the lists differ
     ref_fa, fwd_fq, rev_fq, expected_bam = [
         os.path.join(data_dir, basename)
         for basename in (
             'mapping_test_bowtie2_ref.fa',
             'mapping_test_bowtie2_reads_1.fq',
             'mapping_test_bowtie2_reads_2.fq',
             'mapping_test_bowtie2_unsorted.bam',
         )
     ]
     out_prefix = 'tmp.out.bowtie2'
     got_bam = out_prefix + '.bam'

     mapping.run_bowtie2(
         fwd_fq, rev_fq, ref_fa, out_prefix,
         samtools=extern_progs.exe('samtools'),
         bowtie2=extern_progs.exe('bowtie2'),
     )

     self.assertListEqual(get_sam_columns(expected_bam), get_sam_columns(got_bam))
     os.unlink(got_bam)
Example #15
0
 def test_run_bowtie2_and_sort(self):
     '''Test run_bowtie2 sorted

     Runs run_bowtie2 with sort=True and compares the columns of the BAM
     that is written with those of the stored sorted BAM.  Both the BAM
     and its .bai index are removed afterwards.
     '''
     def fixture(basename):
         return os.path.join(data_dir, basename)

     out_prefix = 'tmp.out.bowtie2'
     got_bam = out_prefix + '.bam'

     mapping.run_bowtie2(
         fixture('mapping_test_bowtie2_reads_1.fq'),
         fixture('mapping_test_bowtie2_reads_2.fq'),
         fixture('mapping_test_bowtie2_ref.fa'),
         out_prefix,
         sort=True,
         bowtie2=extern_progs.exe('bowtie2'),
     )

     expected_columns = get_sam_columns(fixture('mapping_test_bowtie2_sorted.bam'))
     self.assertListEqual(expected_columns, get_sam_columns(got_bam))
     for leftover in (got_bam, got_bam + '.bai'):
         os.unlink(leftover)
Example #16
0
 def test_run_bowtie2_remove_both_unmapped(self):
     '''Test run_bowtie2 unsorted remove both unmapped

     Maps the fixture read pair to the fixture reference with
     remove_both_unmapped=True and compares the columns of the BAM that is
     written with those of the stored expected BAM.
     '''
     def fixture(basename):
         return os.path.join(data_dir, basename)

     self.maxDiff = None  # show the complete diff if the lists differ
     out_prefix = 'tmp.out.bowtie2_remove_both_unmapped'
     got_bam = out_prefix + '.bam'

     mapping.run_bowtie2(
         fixture('mapping_test_bowtie2_remove_both_unmapped_reads_1.fq'),
         fixture('mapping_test_bowtie2_remove_both_unmapped_reads_2.fq'),
         fixture('mapping_test_bowtie2_ref.fa'),
         out_prefix,
         bowtie2=extern_progs.exe('bowtie2'),
         bowtie2_version=extern_progs.version('bowtie2'),
         remove_both_unmapped=True,
     )

     expected_columns = get_sam_columns(
         fixture('mapping_test_bowtie2_remove_both_unmapped_reads.bam'))
     self.assertListEqual(expected_columns, get_sam_columns(got_bam))
     os.unlink(got_bam)
Example #17
0
 def test_run_bowtie2_and_sort(self):
     '''Test run_bowtie2 sorted

     Runs run_bowtie2 with sort=True, passing the bowtie2 executable and
     version reported by extern_progs, and compares the columns of the BAM
     that is written with those of the stored sorted BAM.  Both the BAM
     and its .bai index are removed afterwards.
     '''
     ref_fa, fwd_fq, rev_fq, expected_bam = [
         os.path.join(data_dir, basename)
         for basename in (
             'mapping_test_bowtie2_ref.fa',
             'mapping_test_bowtie2_reads_1.fq',
             'mapping_test_bowtie2_reads_2.fq',
             'mapping_test_bowtie2_sorted.bam',
         )
     ]
     out_prefix = 'tmp.out.bowtie2'
     got_bam = out_prefix + '.bam'

     mapping.run_bowtie2(
         fwd_fq, rev_fq, ref_fa, out_prefix,
         sort=True,
         bowtie2=extern_progs.exe('bowtie2'),
         bowtie2_version=extern_progs.version('bowtie2'),
     )

     self.assertListEqual(get_sam_columns(expected_bam), get_sam_columns(got_bam))
     for leftover in (got_bam, got_bam + '.bai'):
         os.unlink(leftover)
Example #18
0
    def _get_total_alignment_score(self, gene_name):
        '''Return the total alignment score of the reads mapped to one gene.

        gene_name is extracted from self.genes_fa into a scratch FASTA in
        self.root_dir, self.reads1/self.reads2 are mapped to it with bowtie2,
        and the score is read from the resulting BAM by
        mapping.get_total_alignment_score.

        The scratch BAM, the scratch FASTA and the accompanying .fai file are
        deleted before this method exits, including when a step raises.
        Without that, leftovers from one failed call would trip the
        existence checks below on every later call.
        '''
        tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam')
        # refuse to overwrite files that are already present
        assert not os.path.exists(tmp_bam)
        tmp_fa = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.ref.fa')
        assert not os.path.exists(tmp_fa)

        try:
            faidx.write_fa_subset([gene_name], self.genes_fa, tmp_fa, samtools_exe=self.samtools_exe, verbose=self.verbose)
            mapping.run_bowtie2(
                self.reads1,
                self.reads2,
                tmp_fa,
                tmp_bam[:-4],  # output prefix: the BAM name without '.bam'
                threads=self.threads,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=self.verbose,
            )

            return mapping.get_total_alignment_score(tmp_bam)
        finally:
            # Only remove what actually got created, so that a failure part
            # way through is not masked by a FileNotFoundError from here.
            for leftover in (tmp_bam, tmp_fa, tmp_fa + '.fai'):
                if os.path.exists(leftover):
                    os.unlink(leftover)
Example #19
0
    def _run(self):
        '''Assemble this cluster's reads and analyse the assembly.

        Steps, all logged to self.log_fh:
          1. Subsample the reads and run assembly.Assembly on them.  This is
             skipped, with self.assembled_ok = False, when self.total_reads
             is 0.
          2. If the assembly worked and self.assembly.ref_seq_name is not
             None: map all reads to the assembly, compare the assembly with
             the reference, collect the assembly variants and call variants
             with samtools, updating self.status_flag along the way.
          3. Otherwise add 'assembly_fail' or 'ref_seq_choose_fail' to
             self.status_flag.
          4. Build self.report_lines, clean up and unregister the atexit
             hook.

        Raises:
            Error: if report.report_lines fails.  The traceback is printed
                to stderr and the original exception is chained as the
                cause.  KeyboardInterrupt and SystemExit are not converted.
        '''
        print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)

        if self.total_reads == 0:
            print('No reads left after filtering with cdhit', file=self.log_fh, flush=True)
            self.assembled_ok = False
        else:
            wanted_reads = self._number_of_reads_for_assembly(self.longest_ref_length, self.reads_insert, self.total_reads_bases, self.total_reads, self.assembly_coverage)
            made_reads = self._make_reads_for_assembly(wanted_reads, self.total_reads, self.all_reads1, self.all_reads2, self.reads_for_assembly1, self.reads_for_assembly2, random_seed=self.random_seed)
            print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
            print('Assembling reads:', file=self.log_fh, flush=True)

            self.assembly = assembly.Assembly(
                self.reads_for_assembly1,
                self.reads_for_assembly2,
                self.reference_fa,
                self.references_fa,
                self.assembly_dir,
                self.final_assembly_fa,
                self.final_assembly_bam,
                self.log_fh,
                self.all_refs_fasta,
                contig_name_prefix=self.name,
                assembler=self.assembler,
                extern_progs=self.extern_progs,
                clean=self.clean
            )

            self.assembly.run()
            self.assembled_ok = self.assembly.assembled_ok
            self._clean_file(self.reads_for_assembly1)
            self._clean_file(self.reads_for_assembly2)
            if self.clean:
                print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
                shutil.rmtree(self.assembly_dir)

        # self.assembly only exists when there were reads; the short-circuit
        # on assembled_ok keeps the attribute access below safe.
        if self.assembled_ok and self.assembly.ref_seq_name is not None:
            self.ref_sequence = self.refdata.sequence(self.assembly.ref_seq_name)
            is_gene, is_variant_only = self.refdata.sequence_type(self.ref_sequence.id)
            self.is_gene = '1' if is_gene == 'p' else '0'
            self.is_variant_only = '1' if is_variant_only else '0'

            print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)

            # This time all the reads are mapped, not only the subsample
            # that was used for the assembly.
            mapping.run_bowtie2(
                self.all_reads1,
                self.all_reads2,
                self.final_assembly_fa,
                self.final_assembly_bam[:-4],  # output prefix: BAM name without '.bam'
                threads=1,
                sort=True,
                bowtie2=self.extern_progs.exe('bowtie2'),
                bowtie2_preset='very-sensitive-local',
                bowtie2_version=self.extern_progs.version('bowtie2'),
                verbose=True,
                verbose_filehandle=self.log_fh
            )

            if self.assembly.has_contigs_on_both_strands:
                self.status_flag.add('hit_both_strands')

            print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)

            if not self.assembly.scaff_graph_ok:
                self.status_flag.add('scaffold_graph_bad')

            print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
            self.assembly_compare = assembly_compare.AssemblyCompare(
                self.final_assembly_fa,
                self.assembly.sequences,
                self.reference_fa,
                self.ref_sequence,
                self.assembly_compare_prefix,
                self.refdata,
                nucmer_min_id=self.nucmer_min_id,
                nucmer_min_len=self.nucmer_min_len,
                nucmer_breaklen=self.nucmer_breaklen,
                assembled_threshold=self.assembled_threshold,
                unique_threshold=self.unique_threshold,
                max_gene_nt_extend=self.max_gene_nt_extend,
            )
            self.assembly_compare.run()
            self.status_flag = self.assembly_compare.update_flag(self.status_flag)

            allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(self.assembly_compare.nucmer_hits)
            assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file)
            self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos)

            # Stop scanning once 'has_variant' is set: the first variant
            # whose field 3 is not '.', 'SYN' or None is enough.
            for var_list in self.assembly_variants.values():
                for var in var_list:
                    if var[3] not in ['.', 'SYN', None]:
                        self.status_flag.add('has_variant')
                        break

                if self.status_flag.has('has_variant'):
                    break

            print('\nCalling variants with samtools:', file=self.log_fh, flush=True)

            self.samtools_vars = samtools_variants.SamtoolsVariants(
                self.final_assembly_fa,
                self.final_assembly_bam,
                self.samtools_vars_prefix,
                log_fh=self.log_fh,
                min_var_read_depth=self.min_var_read_depth,
                min_second_var_read_depth=self.min_second_var_read_depth,
                max_allele_freq=self.max_allele_freq
            )
            self.samtools_vars.run()

            self.total_contig_depths = self.samtools_vars.total_depth_per_contig(self.samtools_vars.contig_depths_file)

            self.variants_from_samtools = self.samtools_vars.variants_in_coords(self.assembly_compare.assembly_match_coords(), self.samtools_vars.vcf_file)
            if len(self.variants_from_samtools):
                self.status_flag.add('variants_suggest_collapsed_repeat')
        elif not self.assembled_ok:
            print('\nAssembly failed\n', file=self.log_fh, flush=True)
            self.status_flag.add('assembly_fail')
        elif self.assembly.ref_seq_name is None:
            print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
            self.status_flag.add('ref_seq_choose_fail')

        try:
            self.report_lines = report.report_lines(self)
        except Exception as err:
            # Catch Exception, not everything: KeyboardInterrupt and
            # SystemExit must propagate untouched instead of being turned
            # into a reporting error.
            print('Error making report for cluster ', self.name, '... traceback:', file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            raise Error('Error making report for cluster ' + self.name) from err

        self._clean()
        atexit.unregister(self._atexit)
Example #20
0
    def _run(self):
        '''Run the whole analysis for this cluster.

        Steps, all logged to self.log_fh:
          1. Choose the best reference sequence with BestSeqChooser.  If
             none is found, 'ref_seq_choose_fail' is added to
             self.status_flag and the assembly is skipped.
          2. Subsample the reads and run assembly.Assembly on them.
          3. If the assembly worked: map all reads to the assembly, compare
             the assembly with the reference, collect the assembly variants
             and call variants with samtools, updating self.status_flag
             along the way.  Otherwise add 'assembly_fail'.
          4. Build self.report_lines, clean up and unregister the atexit
             hook.
        '''
        print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)

        print('Choosing best reference sequence:', file=self.log_fh, flush=True)
        seq_chooser = best_seq_chooser.BestSeqChooser(
            self.all_reads1,
            self.all_reads2,
            self.references_fa,
            self.log_fh,
            samtools_exe=self.extern_progs.exe('samtools'),
            bowtie2_exe=self.extern_progs.exe('bowtie2'),
            bowtie2_preset=self.bowtie2_preset,
            threads=1,
        )
        self.ref_sequence = seq_chooser.best_seq(self.reference_fa)
        # the multi-sequence references file and its index are not used again below
        self._clean_file(self.references_fa)
        self._clean_file(self.references_fa + '.fai')

        if self.ref_sequence is None:
            self.status_flag.add('ref_seq_choose_fail')
            self.assembled_ok = False
        else:
            wanted_reads = self._number_of_reads_for_assembly(self.reference_fa, self.reads_insert, self.total_reads_bases, self.total_reads, self.assembly_coverage)
            made_reads = self._make_reads_for_assembly(wanted_reads, self.total_reads, self.all_reads1, self.all_reads2, self.reads_for_assembly1, self.reads_for_assembly2, random_seed=self.random_seed)
            print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
            print('Assembling reads:', file=self.log_fh, flush=True)
            self.ref_sequence_type = self.refdata.sequence_type(self.ref_sequence.id)
            assert self.ref_sequence_type is not None
            self.assembly = assembly.Assembly(
              self.reads_for_assembly1,
              self.reads_for_assembly2,
              self.reference_fa,
              self.assembly_dir,
              self.final_assembly_fa,
              self.final_assembly_bam,
              self.log_fh,
              scaff_name_prefix=self.ref_sequence.id,
              kmer=self.assembly_kmer,
              assembler=self.assembler,
              spades_other_options=self.spades_other_options,
              sspace_k=self.sspace_k,
              sspace_sd=self.sspace_sd,
              reads_insert=self.reads_insert,
              extern_progs=self.extern_progs,
              clean=self.clean
            )

            self.assembly.run()
            self.assembled_ok = self.assembly.assembled_ok
            # the subsampled read files were only needed by the assembler
            self._clean_file(self.reads_for_assembly1)
            self._clean_file(self.reads_for_assembly2)
            if self.clean:
                print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
                shutil.rmtree(self.assembly_dir)

        if self.assembled_ok:
            print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)

            # This time all the reads are mapped, not only the subsample
            # that was used for the assembly.  The fourth argument is the
            # output prefix, i.e. the BAM name minus its last four characters.
            mapping.run_bowtie2(
                self.all_reads1,
                self.all_reads2,
                self.final_assembly_fa,
                self.final_assembly_bam[:-4],
                threads=1,
                sort=True,
                samtools=self.extern_progs.exe('samtools'),
                bowtie2=self.extern_progs.exe('bowtie2'),
                bowtie2_preset=self.bowtie2_preset,
                verbose=True,
                verbose_filehandle=self.log_fh
            )

            if self.assembly.has_contigs_on_both_strands:
                self.status_flag.add('hit_both_strands')

            print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)

            # the check itself was done by self.assembly.run(); only the flag is set here
            if not self.assembly.scaff_graph_ok:
                self.status_flag.add('scaffold_graph_bad')

            print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
            self.assembly_compare = assembly_compare.AssemblyCompare(
              self.final_assembly_fa,
              self.assembly.sequences,
              self.reference_fa,
              self.ref_sequence,
              self.assembly_compare_prefix,
              self.refdata,
              nucmer_min_id=self.nucmer_min_id,
              nucmer_min_len=self.nucmer_min_len,
              nucmer_breaklen=self.nucmer_breaklen,
              assembled_threshold=self.assembled_threshold,
              unique_threshold=self.unique_threshold,
              max_gene_nt_extend=self.max_gene_nt_extend,
            )
            self.assembly_compare.run()
            self.status_flag = self.assembly_compare.update_flag(self.status_flag)

            nucmer_hits_to_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(self.assembly_compare.nucmer_hits)
            assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file)
            self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, nucmer_hits_to_ref)

            # Stop scanning once 'has_nonsynonymous_variants' is set: the
            # first variant whose field 3 is not '.', 'SYN' or None is enough.
            # NOTE(review): field 3 is presumably the variant effect -- confirm.
            for var_list in self.assembly_variants.values():
                for var in var_list:
                    if var[3] not in ['.', 'SYN', None]:
                        self.status_flag.add('has_nonsynonymous_variants')
                        break

                if self.status_flag.has('has_nonsynonymous_variants'):
                    break


            print('\nCalling variants with samtools:', file=self.log_fh, flush=True)

            self.samtools_vars = samtools_variants.SamtoolsVariants(
                self.final_assembly_fa,
                self.final_assembly_bam,
                self.samtools_vars_prefix,
                log_fh=self.log_fh,
                samtools_exe=self.extern_progs.exe('samtools'),
                bcftools_exe=self.extern_progs.exe('bcftools'),
                bcf_min_dp=self.bcf_min_dp,
                bcf_min_dv=self.bcf_min_dv,
                bcf_min_dv_over_dp=self.bcf_min_dv_over_dp,
                bcf_min_qual=self.bcf_min_qual,
            )
            self.samtools_vars.run()

            self.total_contig_depths = self.samtools_vars.total_depth_per_contig(self.samtools_vars.read_depths_file)

            # any samtools variant inside the matching regions sets this flag
            if self.samtools_vars.variants_in_coords(self.assembly_compare.assembly_match_coords(), self.samtools_vars.vcf_file):
                self.status_flag.add('variants_suggest_collapsed_repeat')
        else:
            # also reached when no reference sequence could be chosen above
            print('\nAssembly failed\n', file=self.log_fh, flush=True)
            self.status_flag.add('assembly_fail')


        print('\nMaking report lines', file=self.log_fh, flush=True)
        self.report_lines = report.report_lines(self)
        self._clean()
        atexit.unregister(self._atexit)
Example #21
0
    def _run(self):
        """Assemble this cluster's reads, assess the assembly and build the report.

        Everything is logged to ``self.log_fh``.  The steps are:

        1. If ``self.total_reads`` is zero the assembly is marked as failed;
           otherwise a subset of reads is made and the assembler is run.
        2. If the assembly succeeded and it has a reference sequence name,
           all reads are mapped to the final assembly with bowtie2, the
           assembly is compared against the reference, assembly variants
           are collected and variants are called with samtools.  Each check
           may add a flag to ``self.status_flag`` (``hit_both_strands``,
           ``scaffold_graph_bad``, ``has_variant``,
           ``variants_suggest_collapsed_repeat``).
        3. Otherwise ``assembly_fail`` or ``ref_seq_choose_fail`` is added
           to ``self.status_flag``.
        4. ``self.report_lines`` is built, ``self._clean()`` is called and
           the ``self._atexit`` hook is unregistered.

        Raises:
            Error: if building the report lines fails.  The traceback is
                printed to stderr and the original exception is chained as
                the cause.
        """
        print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '),
              file=self.log_fh,
              flush=True)

        if self.total_reads == 0:
            print('No reads left after filtering with cdhit',
                  file=self.log_fh,
                  flush=True)
            self.assembled_ok = False
        else:
            wanted_reads = self._number_of_reads_for_assembly(
                self.longest_ref_length, self.reads_insert,
                self.total_reads_bases, self.total_reads,
                self.assembly_coverage)
            made_reads = self._make_reads_for_assembly(
                wanted_reads,
                self.total_reads,
                self.all_reads1,
                self.all_reads2,
                self.reads_for_assembly1,
                self.reads_for_assembly2,
                random_seed=self.random_seed)
            print('\nUsing',
                  made_reads,
                  'from a total of',
                  self.total_reads,
                  'for assembly.',
                  file=self.log_fh,
                  flush=True)
            print('Assembling reads:', file=self.log_fh, flush=True)

            self._update_threads()
            self.assembly = assembly.Assembly(
                self.reads_for_assembly1,
                self.reads_for_assembly2,
                self.reference_fa,
                self.references_fa,
                self.assembly_dir,
                self.final_assembly_fa,
                self.final_assembly_bam,
                self.log_fh,
                self.all_refs_fasta,
                contig_name_prefix=self.name,
                assembler=self.assembler,
                extern_progs=self.extern_progs,
                clean=self.clean,
                spades_mode=self.spades_mode,
                spades_options=self.spades_options,
                threads=self.threads)

            self.assembly.run()
            self.assembled_ok = self.assembly.assembled_ok

            # The read subsets are only needed while assembling.
            self._clean_file(self.reads_for_assembly1)
            self._clean_file(self.reads_for_assembly2)
            if self.clean:
                print('Deleting Assembly directory',
                      self.assembly_dir,
                      file=self.log_fh,
                      flush=True)
                shutil.rmtree(self.assembly_dir, ignore_errors=True)

        # NOTE: self.assembly only exists when there were reads; the
        # short-circuit on self.assembled_ok keeps the zero-read case from
        # touching it here and in the elif chain below.
        if self.assembled_ok and self.assembly.ref_seq_name is not None:
            self.ref_sequence = self.refdata.sequence(
                self.assembly.ref_seq_name)
            is_gene, is_variant_only = self.refdata.sequence_type(
                self.ref_sequence.id)
            self.is_gene = '1' if is_gene == 'p' else '0'
            self.is_variant_only = '1' if is_variant_only else '0'

            print('\nAssembly was successful\n\nMapping reads to assembly:',
                  file=self.log_fh,
                  flush=True)
            self._update_threads()
            mapping.run_bowtie2(
                self.all_reads1,
                self.all_reads2,
                self.final_assembly_fa,
                # Output prefix: the BAM name minus its last four characters
                # (presumably the '.bam' extension -- confirm).
                self.final_assembly_bam[:-4],
                threads=self.threads,
                sort=True,
                bowtie2=self.extern_progs.exe('bowtie2'),
                bowtie2_preset='very-sensitive-local',
                bowtie2_version=self.extern_progs.version('bowtie2'),
                verbose=True,
                verbose_filehandle=self.log_fh)

            if self.assembly.has_contigs_on_both_strands:
                self.status_flag.add('hit_both_strands')

            print('\nMaking and checking scaffold graph',
                  file=self.log_fh,
                  flush=True)

            if not self.assembly.scaff_graph_ok:
                self.status_flag.add('scaffold_graph_bad')

            print('Comparing assembly against reference sequence',
                  file=self.log_fh,
                  flush=True)
            self.assembly_compare = assembly_compare.AssemblyCompare(
                self.final_assembly_fa,
                self.assembly.sequences,
                self.reference_fa,
                self.ref_sequence,
                self.assembly_compare_prefix,
                self.refdata,
                nucmer_min_id=self.nucmer_min_id,
                nucmer_min_len=self.nucmer_min_len,
                nucmer_breaklen=self.nucmer_breaklen,
                assembled_threshold=self.assembled_threshold,
                unique_threshold=self.unique_threshold,
                max_gene_nt_extend=self.max_gene_nt_extend,
            )
            self.assembly_compare.run()
            self.status_flag = self.assembly_compare.update_flag(
                self.status_flag)

            allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(
                self.assembly_compare.nucmer_hits)
            assembly_variants_obj = assembly_variants.AssemblyVariants(
                self.refdata, self.assembly_compare.nucmer_snps_file)
            self.assembly_variants = assembly_variants_obj.get_variants(
                self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos)

            # One variant whose fourth field is anything other than '.',
            # 'SYN' or None is enough to set the flag.
            has_variant = any(
                var[3] not in ('.', 'SYN', None)
                for var_list in self.assembly_variants.values()
                for var in var_list)
            if has_variant:
                self.status_flag.add('has_variant')

            print('\nCalling variants with samtools:',
                  file=self.log_fh,
                  flush=True)

            self.samtools_vars = samtools_variants.SamtoolsVariants(
                self.final_assembly_fa,
                self.final_assembly_bam,
                self.samtools_vars_prefix,
                log_fh=self.log_fh,
                min_var_read_depth=self.min_var_read_depth,
                min_second_var_read_depth=self.min_second_var_read_depth,
                max_allele_freq=self.max_allele_freq)
            self.samtools_vars.run()

            self.total_contig_depths = self.samtools_vars.total_depth_per_contig(
                self.samtools_vars.contig_depths_file)

            self.variants_from_samtools = self.samtools_vars.variants_in_coords(
                self.assembly_compare.assembly_match_coords(),
                self.samtools_vars.vcf_file)
            if self.variants_from_samtools:
                self.status_flag.add('variants_suggest_collapsed_repeat')
        elif not self.assembled_ok:
            print('\nAssembly failed\n', file=self.log_fh, flush=True)
            self.status_flag.add('assembly_fail')
        elif self.assembly.ref_seq_name is None:
            print('\nCould not get closest reference sequence\n',
                  file=self.log_fh,
                  flush=True)
            self.status_flag.add('ref_seq_choose_fail')

        try:
            self.report_lines = report.report_lines(self)
        except Exception as err:
            # Only genuine errors are wrapped; KeyboardInterrupt and
            # SystemExit propagate untouched instead of being converted.
            print('Error making report for cluster ',
                  self.name,
                  '... traceback:',
                  file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            raise Error('Error making report for cluster ' + self.name) from err

        self._clean()
        atexit.unregister(self._atexit)
Example #22
0
    def run(self):
        """Assemble the reads, choose the closest reference and check the result.

        Runs the assembler selected by ``self.assembler`` ('fermilite' or
        'spades'), then verifies that contigs were written.  When there are
        contigs, the closest reference sequence is chosen and copied to
        ``self.ref_fasta``, the contigs are oriented against it, the reads
        are mapped to the final assembly with bowtie2 and the BAM is parsed
        to decide whether the scaffold graph is OK.

        Outcomes are stored on the instance (``assembled_ok``, ``sequences``,
        ``ref_seq_name``, ``has_contigs_on_both_strands``,
        ``scaff_graph_ok``).  ``self.log_fh`` is set to None on every return
        path so the object stays picklable for multithreading.
        """
        if self.assembler == 'fermilite':
            self._assemble_with_fermilite()
        elif self.assembler == "spades":
            self._assemble_with_spades()
        print('Finished running assemblies', flush=True, file=self.log_fh)
        self.sequences = {}

        # Double-check we got some contigs: a missing file counts as zero.
        if os.path.exists(self.all_assembly_contigs_fa):
            contig_total = pyfastaq.tasks.count_sequences(
                self.all_assembly_contigs_fa)
        else:
            contig_total = 0

        self.assembled_ok = contig_total != 0
        if not self.assembled_ok:
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return

        chooser = ref_seq_chooser.RefSeqChooser(
            self.ref_fastas,
            self.all_reference_fasta,
            self.all_assembly_contigs_fa,
            self.best_assembly_fa,
            self.log_fh,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
        )
        chooser.run()

        # Give up unless a closest reference exists and is in the cluster.
        closest = chooser.closest_ref_from_all_refs
        if closest is None or not chooser.closest_ref_is_in_cluster:
            if closest is None:
                print('Could not find match to reference sequences',
                      file=self.log_fh)
            else:
                print('Closest reference',
                      closest,
                      'was not in cluster',
                      file=self.log_fh)
            self.ref_seq_name = None
            self.log_fh = None
            return

        self.ref_seq_name = closest
        print('Closest reference sequence:',
              self.ref_seq_name,
              file=self.log_fh)

        # Write the first sequence whose id matches the chosen name, if any.
        chosen_ref = next(
            (seq for seq in pyfastaq.sequences.file_reader(self.ref_fastas)
             if self.ref_seq_name == seq.id),
            None)
        if chosen_ref is not None:
            ref_out = pyfastaq.utils.open_file_write(self.ref_fasta)
            print(chosen_ref, file=ref_out)
            pyfastaq.utils.close(ref_out)

        both_strand_contigs = self._fix_contig_orientation(
            self.best_assembly_fa,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen)
        self.has_contigs_on_both_strands = len(both_strand_contigs) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        # Output prefix is the BAM name minus its last four characters
        # (presumably the '.bam' extension -- confirm).
        bam_prefix = self.final_assembly_bam[:-4]
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            bam_prefix,
            threads=self.threads,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh)

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert)
        print('Scaffolding graph is OK:',
              self.scaff_graph_ok,
              file=self.log_fh)

        if self.clean:
            for suffix in ('soft_clipped', 'unmapped_mates', 'scaff'):
                doomed = self.final_assembly_bam + '.' + suffix
                print('Deleting file', doomed, file=self.log_fh)
                os.unlink(doomed)

        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None
Example #23
0
    def run(self):
        """Run the fermilite assembly and post-process its contigs.

        After assembling, checks that contigs were written.  When there are
        contigs, chooses the closest reference sequence, copies it to
        ``self.ref_fasta``, orients the contigs against it, maps the reads
        to the final assembly with bowtie2 (single thread) and parses the
        BAM to decide whether the scaffold graph is OK.

        Outcomes are stored on the instance (``assembled_ok``, ``sequences``,
        ``ref_seq_name``, ``has_contigs_on_both_strands``,
        ``scaff_graph_ok``).  ``self.log_fh`` is set to None on every return
        path so the object stays picklable for multithreading.
        """
        self._assemble_with_fermilite()
        print('Finished running assemblies', flush=True, file=self.log_fh)
        self.sequences = {}

        # Double-check we got some contigs; a missing file counts as none.
        if os.path.exists(self.all_assembly_contigs_fa):
            n_contigs = pyfastaq.tasks.count_sequences(
                self.all_assembly_contigs_fa)
        else:
            n_contigs = 0

        if n_contigs == 0:
            self.assembled_ok = False
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return

        self.assembled_ok = True

        ref_chooser = ref_seq_chooser.RefSeqChooser(
            self.ref_fastas,
            self.all_reference_fasta,
            self.all_assembly_contigs_fa,
            self.best_assembly_fa,
            self.log_fh,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
        )
        ref_chooser.run()

        # Work out which log message (if any) explains why no reference
        # from this cluster can be used.
        best_ref = ref_chooser.closest_ref_from_all_refs
        if best_ref is None:
            problem = ('Could not find match to reference sequences',)
        elif not ref_chooser.closest_ref_is_in_cluster:
            problem = ('Closest reference', best_ref, 'was not in cluster')
        else:
            problem = None

        if problem is not None:
            print(*problem, file=self.log_fh)
            self.ref_seq_name = None
            self.log_fh = None
            return

        self.ref_seq_name = best_ref
        print('Closest reference sequence:', self.ref_seq_name,
              file=self.log_fh)

        # Copy the first sequence with the chosen id into self.ref_fasta.
        for candidate in pyfastaq.sequences.file_reader(self.ref_fastas):
            if self.ref_seq_name != candidate.id:
                continue
            ref_handle = pyfastaq.utils.open_file_write(self.ref_fasta)
            print(candidate, file=ref_handle)
            pyfastaq.utils.close(ref_handle)
            break

        orientation_hits = self._fix_contig_orientation(
            self.best_assembly_fa,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen,
        )
        self.has_contigs_on_both_strands = len(orientation_hits) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        programs = self.extern_progs
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            # BAM name minus its last four characters
            # (presumably the '.bam' extension -- confirm).
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            bowtie2=programs.exe('bowtie2'),
            bowtie2_version=programs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert,
        )
        print('Scaffolding graph is OK:', self.scaff_graph_ok,
              file=self.log_fh)

        if self.clean:
            leftovers = [
                self.final_assembly_bam + '.' + suffix
                for suffix in ('soft_clipped', 'unmapped_mates', 'scaff')
            ]
            for leftover in leftovers:
                print('Deleting file', leftover, file=self.log_fh)
                os.unlink(leftover)

        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None