Ejemplo n.º 1
0
    def _total_alignment_score(self, seq_name):
        '''Maps the reads to one reference sequence and returns the total alignment score.

        The sequence called seq_name is extracted from self.references_fa into
        a FASTA file inside a fresh temporary directory (made in the current
        working directory). self.reads1 and self.reads2 are then mapped to it
        with mapping.run_bowtie2(), and the value returned by
        mapping.get_total_alignment_score() for the resulting BAM file is
        returned.

        The temporary directory is always deleted, including when one of the
        steps raises, so failed runs do not leave tmp.get_total_aln_score.*
        directories behind.'''
        tmpdir = tempfile.mkdtemp(prefix='tmp.get_total_aln_score.', dir=os.getcwd())

        try:
            tmp_bam = os.path.join(tmpdir, 'tmp.get_total_alignment_score.bam')
            tmp_fa = os.path.join(tmpdir, 'tmp.get_total_alignment_score.ref.fa')

            faidx.write_fa_subset(
                [seq_name],
                self.references_fa,
                tmp_fa,
                samtools_exe=self.samtools_exe,
                verbose=True,
                verbose_filehandle=self.log_fh
            )

            # run_bowtie2 is given the BAM path without its '.bam' suffix;
            # the score is then read from the full tmp_bam path below.
            mapping.run_bowtie2(
                self.reads1,
                self.reads2,
                tmp_fa,
                tmp_bam[:-4],
                threads=self.threads,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=True,
                verbose_filehandle=self.log_fh
            )

            return mapping.get_total_alignment_score(tmp_bam)
        finally:
            # Clean up whether or not the mapping succeeded
            shutil.rmtree(tmpdir)
Ejemplo n.º 2
0
    def _get_total_alignment_score(self, gene_name):
        '''Maps the reads to one gene and returns the total alignment score.

        The sequence called gene_name is extracted from self.genes_fa into a
        temporary FASTA file in self.root_dir. self.reads1 and self.reads2 are
        mapped to it with mapping.run_bowtie2(), and the value returned by
        mapping.get_total_alignment_score() for the resulting BAM file is
        returned.

        The temporary BAM, FASTA and FASTA index (.fai) files are removed
        before returning, and also when one of the steps raises. Otherwise a
        failed run would leave them behind and the "must not already exist"
        assertions below would fail on the next call. Files that were never
        created are skipped.'''
        tmp_bam = os.path.join(self.root_dir,
                               'tmp.get_total_alignment_score.bam')
        assert not os.path.exists(tmp_bam)
        tmp_fa = os.path.join(self.root_dir,
                              'tmp.get_total_alignment_score.ref.fa')
        assert not os.path.exists(tmp_fa)

        # The try block starts after the assertions on purpose: a file that
        # already existed before this call must not be deleted by the cleanup.
        try:
            faidx.write_fa_subset([gene_name],
                                  self.genes_fa,
                                  tmp_fa,
                                  samtools_exe=self.samtools_exe,
                                  verbose=self.verbose)

            # run_bowtie2 is given the BAM path without its '.bam' suffix;
            # the score is then read from the full tmp_bam path below.
            mapping.run_bowtie2(
                self.reads1,
                self.reads2,
                tmp_fa,
                tmp_bam[:-4],
                threads=self.threads,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=self.verbose,
            )

            return mapping.get_total_alignment_score(tmp_bam)
        finally:
            for filename in (tmp_bam, tmp_fa, tmp_fa + '.fai'):
                if os.path.exists(filename):
                    os.unlink(filename)
Ejemplo n.º 3
0
 def test_write_fa_subset(self):
     '''write_fa_subset output for seq1, seq3 and seq4 should match the expected FASTA file'''
     wanted_names = ['seq1', 'seq3', 'seq4']
     fa_in = os.path.join(data_dir, 'faidx_test_write_fa_subset.in.fa')
     fa_expected = os.path.join(data_dir, 'faidx_test_write_fa_subset.out.fa')
     fa_got = 'tmp.test_write_fa_subset.out.fa'

     faidx.write_fa_subset(wanted_names, fa_in, fa_got)

     # shallow=False compares the file contents, not just the os.stat() data
     files_match = filecmp.cmp(fa_expected, fa_got, shallow=False)
     self.assertTrue(files_match)
     os.unlink(fa_got)
Ejemplo n.º 4
0
 def _choose_best_gene(self):
     '''Writes the best-scoring gene to self.gene_fa and returns its sequence.

     The gene name comes from self._get_best_gene_by_alignment_score().
     Returns None if that gives None. Otherwise the gene is extracted from
     self.genes_fa into self.gene_fa, that file is loaded with
     pyfastaq.tasks.file_to_dict(), and the single loaded sequence is returned.'''
     best_name = self._get_best_gene_by_alignment_score()
     if best_name is None:
         return None

     faidx.write_fa_subset(
         [best_name],
         self.genes_fa,
         self.gene_fa,
         samtools_exe=self.samtools_exe,
         verbose=self.verbose
     )

     name_to_seq = {}
     pyfastaq.tasks.file_to_dict(self.gene_fa, name_to_seq)
     # Exactly one sequence was requested, so exactly one should be loaded
     assert len(name_to_seq) == 1
     loaded = list(name_to_seq.values())
     return loaded[0]
Ejemplo n.º 5
0
 def best_seq(self, outfile):
     '''Finds the closest matching sequence, writes it to the FASTA file outfile, and returns it.

     The sequence name comes from self._get_best_seq_by_alignment_score().
     Returns None (and writes nothing) if that gives None. Otherwise the
     sequence is extracted from self.references_fa into outfile, and the
     sequence object loaded back from outfile by
     pyfastaq.tasks.file_to_dict() is returned.'''
     best_name = self._get_best_seq_by_alignment_score()
     if best_name is None:
         return None

     faidx.write_fa_subset(
         [best_name],
         self.references_fa,
         outfile,
         samtools_exe=self.samtools_exe,
         verbose=True,
         verbose_filehandle=self.log_fh
     )

     name_to_seq = {}
     pyfastaq.tasks.file_to_dict(outfile, name_to_seq)
     # Exactly one sequence was requested, so exactly one should be loaded
     assert len(name_to_seq) == 1
     loaded = list(name_to_seq.values())
     return loaded[0]
Ejemplo n.º 6
0
 def _choose_best_gene(self):
     '''Picks the gene with the best alignment score and returns its sequence.

     Returns None when self._get_best_gene_by_alignment_score() gives None.
     Otherwise extracts that gene from self.genes_fa into self.gene_fa,
     loads self.gene_fa with pyfastaq.tasks.file_to_dict(), and returns the
     one sequence found there.'''
     chosen = self._get_best_gene_by_alignment_score()
     if chosen is None:
         return None

     faidx.write_fa_subset([chosen], self.genes_fa, self.gene_fa, samtools_exe=self.samtools_exe, verbose=self.verbose)

     loaded_seqs = {}
     pyfastaq.tasks.file_to_dict(self.gene_fa, loaded_seqs)
     # Only one name was asked for, so the file must hold a single sequence
     assert len(loaded_seqs) == 1
     all_values = list(loaded_seqs.values())
     return all_values[0]
Ejemplo n.º 7
0
    def _init_and_run_clusters(self):
        '''Makes and runs one cluster.Cluster per entry of self.cluster_to_dir.

        Clusters are processed in sorted key order. For each one, the
        sequences named in self.cluster_ids[gene] are extracted from
        self.db_fasta into genes.fa inside the cluster's directory, then a
        cluster.Cluster is made, stored in self.clusters[gene], and run.

        Raises Error if self.cluster_to_dir is empty.'''
        total_clusters = len(self.cluster_to_dir)
        if total_clusters == 0:
            raise Error('Did not get any reads mapped to genes. Cannot continue')

        for number, gene in enumerate(sorted(self.cluster_to_dir), start=1):
            if self.verbose:
                print('\nAssembling cluster', number, 'of', str(total_clusters))

            cluster_dir = self.cluster_to_dir[gene]
            genes_fa = os.path.join(cluster_dir, 'genes.fa')

            faidx.write_fa_subset(
                self.cluster_ids[gene],
                self.db_fasta,
                genes_fa,
                samtools_exe=self.samtools_exe,
                verbose=self.verbose
            )

            cluster_options = {
                'assembly_kmer': self.assembly_kmer,
                'assembler': self.assembler,
                'max_insert': self.insert_proper_pair_max,
                'min_scaff_depth': self.min_scaff_depth,
                'nucmer_min_id': self.nucmer_min_id,
                'nucmer_min_len': self.nucmer_min_len,
                'nucmer_breaklen': self.nucmer_breaklen,
                # NOTE(review): sspace_k is given min_scaff_depth, as in the
                # original code - confirm this is intended
                'sspace_k': self.min_scaff_depth,
                'reads_insert': self.insert_size,
                'sspace_sd': self.insert_sspace_sd,
                'threads': self.threads,
                'assembled_threshold': self.assembled_threshold,
                'unique_threshold': self.unique_threshold,
                'verbose': self.verbose,
                'bcftools_exe': self.bcftools_exe,
                'gapfiller_exe': self.gapfiller_exe,
                'samtools_exe': self.samtools_exe,
                'bowtie2_exe': self.bowtie2_exe,
                'bowtie2_preset': self.bowtie2_preset,
                'spades_exe': self.spades_exe,
                'sspace_exe': self.sspace_exe,
                'velvet_exe': self.velvet,
                'spades_other': self.spades_other,
                'clean': self.clean,
            }

            self.clusters[gene] = cluster.Cluster(cluster_dir, gene, **cluster_options)
            self.clusters[gene].run()
Ejemplo n.º 8
0
    def _init_and_run_clusters(self):
        '''Sets up and runs a cluster.Cluster for every key of self.cluster_to_dir.

        Keys are visited in sorted order. Each iteration writes the sequences
        listed in self.cluster_ids[gene] from self.db_fasta to genes.fa in
        that cluster's directory, makes the cluster.Cluster object, stores it
        in self.clusters[gene] and calls its run() method.

        Raises Error when there are no clusters at all.'''
        number_of_clusters = len(self.cluster_to_dir)
        if number_of_clusters == 0:
            raise Error('Did not get any reads mapped to genes. Cannot continue')

        for i, gene in enumerate(sorted(self.cluster_to_dir), start=1):
            if self.verbose:
                print('\nAssembling cluster', i, 'of', str(number_of_clusters))

            gene_dir = self.cluster_to_dir[gene]

            faidx.write_fa_subset(
                self.cluster_ids[gene],
                self.db_fasta,
                os.path.join(gene_dir, 'genes.fa'),
                samtools_exe=self.samtools_exe,
                verbose=self.verbose
            )

            new_cluster = cluster.Cluster(
                gene_dir,
                gene,
                assembly_kmer=self.assembly_kmer,
                assembler=self.assembler,
                max_insert=self.insert_proper_pair_max,
                min_scaff_depth=self.min_scaff_depth,
                nucmer_min_id=self.nucmer_min_id,
                nucmer_min_len=self.nucmer_min_len,
                nucmer_breaklen=self.nucmer_breaklen,
                # NOTE(review): sspace_k is given min_scaff_depth, as in the
                # original code - confirm this is intended
                sspace_k=self.min_scaff_depth,
                reads_insert=self.insert_size,
                sspace_sd=self.insert_sspace_sd,
                threads=self.threads,
                assembled_threshold=self.assembled_threshold,
                unique_threshold=self.unique_threshold,
                verbose=self.verbose,
                bcftools_exe=self.bcftools_exe,
                gapfiller_exe=self.gapfiller_exe,
                samtools_exe=self.samtools_exe,
                bowtie2_exe=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                spades_exe=self.spades_exe,
                sspace_exe=self.sspace_exe,
                velvet_exe=self.velvet,
                spades_other=self.spades_other,
                clean=self.clean,
            )

            # Store the cluster before running it, as the original code does
            self.clusters[gene] = new_cluster
            new_cluster.run()
Ejemplo n.º 9
0
    def _get_total_alignment_score(self, gene_name):
        '''Returns the total alignment score from mapping the reads to the gene gene_name.

        gene_name is extracted from self.genes_fa into a temporary FASTA file
        in self.root_dir, self.reads1 and self.reads2 are mapped to it with
        mapping.run_bowtie2(), and the result of
        mapping.get_total_alignment_score() on the resulting BAM file is
        returned.

        The temporary BAM, FASTA and .fai files are deleted on the way out,
        whether the method returns normally or raises. Without this, a failure
        part way through would leave the files in place and trip the
        assertions below on every later call. Files that do not exist are
        skipped.'''
        tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam')
        assert not os.path.exists(tmp_bam)
        tmp_fa = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.ref.fa')
        assert not os.path.exists(tmp_fa)

        # Cleanup only covers code after the assertions, so that a file that
        # was already there before this call is never deleted by us.
        try:
            faidx.write_fa_subset([gene_name], self.genes_fa, tmp_fa, samtools_exe=self.samtools_exe, verbose=self.verbose)

            # run_bowtie2 is given the BAM path without its '.bam' suffix;
            # the score is then read from the full tmp_bam path below.
            mapping.run_bowtie2(
                self.reads1,
                self.reads2,
                tmp_fa,
                tmp_bam[:-4],
                threads=self.threads,
                samtools=self.samtools_exe,
                bowtie2=self.bowtie2_exe,
                bowtie2_preset=self.bowtie2_preset,
                verbose=self.verbose,
            )

            return mapping.get_total_alignment_score(tmp_bam)
        finally:
            for filename in (tmp_bam, tmp_fa, tmp_fa + '.fai'):
                if os.path.exists(filename):
                    os.unlink(filename)
Ejemplo n.º 10
0
    def run(self):
        '''Runs the assembly steps in order and records the results on self.

        Steps: assemble (self._assemble_with_fermilite), scaffold, gap fill,
        filter scaffolds by self.min_scaff_length, choose the closest
        reference sequence with mash.Masher, fix contig orientation against
        that reference, map the reads to the final assembly with bowtie2, and
        parse the resulting BAM file.

        Sets self.sequences, self.assembled_ok and, if the run gets that far,
        self.ref_seq_name, self.has_contigs_on_both_strands and
        self.scaff_graph_ok. Returns early if there are no contigs, if no
        scaffolds pass the length filter, or if no reference sequence could be
        chosen. self.log_fh is set to None on every return path, which keeps
        this object picklable (needed for multithreading).'''
        self._assemble_with_fermilite()
        self.sequences = {}

        # double-check we got some contigs
        if os.path.exists(self.assembly_contigs):
            number_of_contigs = pyfastaq.tasks.count_sequences(self.assembly_contigs)
        else:
            number_of_contigs = 0

        self.assembled_ok = number_of_contigs != 0
        if not self.assembled_ok:
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return

        self._scaffold_with_sspace()
        self._gap_fill_with_gapfiller()

        pyfastaq.tasks.filter(
            self.gapfilled_scaffolds,
            self.gapfilled_length_filtered,
            minlength=self.min_scaff_length
        )
        if pyfastaq.tasks.count_sequences(self.gapfilled_length_filtered) == 0:
            self.assembled_ok = False
            # This is to make this object picklable, to keep multithreading happy
            self.log_fh = None
            return

        masher = mash.Masher(self.ref_fastas, self.gapfilled_length_filtered, self.log_fh, self.extern_progs)
        self.ref_seq_name = masher.run(self.mash_dist_file)
        if self.ref_seq_name is None:
            # Note: assembled_ok is left as True on this path
            print('Could not determine closest reference sequence', file=self.log_fh)
            self.log_fh = None
            return

        faidx.write_fa_subset(
            {self.ref_seq_name},
            self.ref_fastas,
            self.ref_fasta,
            samtools_exe=self.extern_progs.exe('samtools'),
            verbose=True,
            verbose_filehandle=self.log_fh
        )
        print('Closest reference sequence according to mash: ', self.ref_seq_name, file=self.log_fh)

        contigs_both_strands = self._fix_contig_orientation(
            self.gapfilled_length_filtered,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen
        )
        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        # run_bowtie2 is given the BAM path without its '.bam' suffix
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            samtools=self.extern_progs.exe('samtools'),
            bowtie2=self.extern_progs.exe('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert
        )
        print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

        if self.clean:
            # Delete the files named <final_assembly_bam>.<suffix>
            for suffix in ('soft_clipped', 'unmapped_mates', 'scaff'):
                filename = self.final_assembly_bam + '.' + suffix
                print('Deleting file', filename, file=self.log_fh)
                os.unlink(filename)

        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None