def test_run_bowtie2_remove_both_unmapped(self):
    '''Test run_bowtie2 unsorted remove both unmapped

    Maps the two fixture read files to the fixture reference with
    remove_both_unmapped=True, then compares the SAM columns of the BAM
    that was written against those of the expected BAM fixture.
    '''
    self.maxDiff = None

    def fixture(basename):
        # All input and expected files live in the test data directory.
        return os.path.join(data_dir, basename)

    bam_prefix = 'tmp.out.bowtie2_remove_both_unmapped'
    bam_written = bam_prefix + '.bam'

    mapping.run_bowtie2(
        fixture('mapping_test_bowtie2_remove_both_unmapped_reads_1.fq'),
        fixture('mapping_test_bowtie2_remove_both_unmapped_reads_2.fq'),
        fixture('mapping_test_bowtie2_ref.fa'),
        bam_prefix,
        bowtie2=extern_progs.exe('bowtie2'),
        bowtie2_version=extern_progs.version('bowtie2'),
        remove_both_unmapped=True,
    )

    expected = get_sam_columns(
        fixture('mapping_test_bowtie2_remove_both_unmapped_reads.bam'))
    got = get_sam_columns(bam_written)
    self.assertListEqual(expected, got)
    os.unlink(bam_written)
def _total_alignment_score(self, seq_name):
    '''Return the total alignment score of the reads against one sequence.

    The sequence called seq_name is written from self.references_fa to a
    FASTA file of its own inside a fresh temporary directory (created in
    the current working directory). self.reads1 and self.reads2 are then
    mapped to that FASTA with mapping.run_bowtie2, and the value that
    mapping.get_total_alignment_score reports for the resulting BAM is
    returned.

    The temporary directory is always removed, including when one of the
    steps raises, so a failed run does not leave files behind.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.get_total_aln_score.', dir=os.getcwd())
    try:
        # run_bowtie2 takes an output prefix; the BAM that is scored
        # afterwards is that prefix plus '.bam'.
        bam_prefix = os.path.join(tmpdir, 'tmp.get_total_alignment_score')
        tmp_bam = bam_prefix + '.bam'
        tmp_fa = os.path.join(tmpdir, 'tmp.get_total_alignment_score.ref.fa')

        faidx.write_fa_subset(
            [seq_name],
            self.references_fa,
            tmp_fa,
            samtools_exe=self.samtools_exe,
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            tmp_fa,
            bam_prefix,
            threads=self.threads,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        return mapping.get_total_alignment_score(tmp_bam)
    finally:
        shutil.rmtree(tmpdir)
def _get_total_alignment_score(self, gene_name):
    '''Return the total alignment score of the reads against one gene.

    The sequence called gene_name is written from self.genes_fa to a
    temporary FASTA in self.root_dir, self.reads1 and self.reads2 are
    mapped to it with mapping.run_bowtie2, and the value reported by
    mapping.get_total_alignment_score for the resulting BAM is returned.

    The temporary BAM, FASTA and FASTA index are removed afterwards, also
    when a step raises. Leftovers from a failed call would otherwise trip
    the "must not exist yet" assertions on the next call.

    Raises AssertionError if either temporary file already exists.
    '''
    tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam')
    assert not os.path.exists(tmp_bam)
    tmp_fa = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.ref.fa')
    assert not os.path.exists(tmp_fa)

    try:
        faidx.write_fa_subset(
            [gene_name],
            self.genes_fa,
            tmp_fa,
            samtools_exe=self.samtools_exe,
            verbose=self.verbose
        )

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            tmp_fa,
            tmp_bam[:-4],  # run_bowtie2 wants the prefix, without '.bam'
            threads=self.threads,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=self.verbose,
        )

        return mapping.get_total_alignment_score(tmp_bam)
    finally:
        # Only remove what was actually created, so that a failure part
        # way through is not masked by a FileNotFoundError from cleanup.
        for leftover in (tmp_bam, tmp_fa, tmp_fa + '.fai'):
            if os.path.exists(leftover):
                os.unlink(leftover)
def _assemble_with_velvet(self):
    '''Assemble the reads with velveth followed by velvetg.

    The reads are first mapped to self.gene_fa with bowtie2; the
    '.unsorted.bam' file next to self.gene_bam is passed to velveth
    together with the reference. Both velvet commands are run from
    inside self.assembly_dir.

    Sets self.velveth_ok and, if velveth succeeded, self.assembled_ok
    from the first value returned by common.syscall. On success a
    symlink named after self.assembly_contigs is made to velvet's
    contigs.fa. On failure of either command the second value returned
    by common.syscall is written to 'velveth_errors' or 'velvetg_errors'
    and 'assembly_fail' is added to self.status_flag.

    The original working directory is restored on every exit path,
    including when an exception is raised.
    '''
    # map reads to reference gene to make BAM input to velvet columbus
    mapping.run_bowtie2(
        self.reads1,
        self.reads2,
        self.gene_fa,
        self.gene_bam[:-4],
        threads=self.threads,
        sort=True,
        samtools=self.samtools_exe,
        bowtie2=self.bowtie2_exe,
        bowtie2_preset=self.bowtie2_preset,
        verbose=self.verbose,
    )

    cmd = ' '.join([
        self.velveth,
        self.assembler_dir,
        str(self.assembly_kmer),
        '-reference', self.gene_fa,
        '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
    ])

    cwd = os.getcwd()
    os.chdir(self.assembly_dir)
    try:
        velvet_contigs = os.path.join(
            os.path.split(self.assembler_dir)[1], 'contigs.fa')

        self.velveth_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
        if not self.velveth_ok:
            # The with-block closes the file; no explicit close needed.
            with open('velveth_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
            return

        cmd = ' '.join([
            self.velvetg,
            self.assembler_dir,
            '-ins_length', str(int(self.reads_insert)),
            '-scaffolding no',
            '-exp_cov auto',
            '-very_clean yes',
            '-cov_cutoff auto',
        ])

        self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
        if self.assembled_ok:
            os.symlink(velvet_contigs, os.path.basename(self.assembly_contigs))
        else:
            with open('velvetg_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
    finally:
        os.chdir(cwd)
def _assemble_with_velvet(self):
    '''Run velveth then velvetg on the reads, guided by the reference gene.

    Steps, in order:
      1. map self.reads1/self.reads2 to self.gene_fa with bowtie2;
      2. run velveth on the '.unsorted.bam' that sits next to
         self.gene_bam, from inside self.assembly_dir;
      3. run velvetg, also from inside self.assembly_dir.

    self.velveth_ok and self.assembled_ok take the first value returned
    by common.syscall for the respective command. When velvetg succeeds,
    a symlink named after self.assembly_contigs is made to velvet's
    contigs.fa. When a command fails, the second value returned by
    common.syscall is saved in 'velveth_errors' / 'velvetg_errors' and
    'assembly_fail' is added to self.status_flag.

    The working directory is put back to where it was in a finally
    clause, so an exception cannot leave the process in
    self.assembly_dir.
    '''
    # map reads to reference gene to make BAM input to velvet columbus
    mapping.run_bowtie2(
        self.reads1,
        self.reads2,
        self.gene_fa,
        self.gene_bam[:-4],
        threads=self.threads,
        sort=True,
        samtools=self.samtools_exe,
        bowtie2=self.bowtie2_exe,
        bowtie2_preset=self.bowtie2_preset,
        verbose=self.verbose,
    )

    cmd = ' '.join([
        self.velveth,
        self.assembler_dir,
        str(self.assembly_kmer),
        '-reference', self.gene_fa,
        '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
    ])

    cwd = os.getcwd()
    os.chdir(self.assembly_dir)
    try:
        velvet_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'contigs.fa')

        self.velveth_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
        if not self.velveth_ok:
            # Leaving the with-block closes the file.
            with open('velveth_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
            return

        cmd = ' '.join([
            self.velvetg,
            self.assembler_dir,
            '-ins_length', str(int(self.reads_insert)),
            '-scaffolding no',
            '-exp_cov auto',
            '-very_clean yes',
            '-cov_cutoff auto',
        ])

        self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
        if self.assembled_ok:
            os.symlink(velvet_contigs, os.path.basename(self.assembly_contigs))
        else:
            with open('velvetg_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
    finally:
        os.chdir(cwd)
def run(self):
    '''Assemble, scaffold, gap-fill and orient contigs, then map the reads.

    Sets self.assembled_ok to False and stops early when the assembler
    produced no contigs, or when nothing is left after the length filter.
    Otherwise self.sequences is filled from the final assembly,
    self.has_contigs_on_both_strands and self.scaff_graph_ok are set,
    and, if self.clean is true, the three side files written next to the
    final BAM are deleted.

    self.log_fh is set to None on every exit path, including when an
    exception is raised.
    '''
    try:
        self._assemble_with_spades()
        self.sequences = {}

        # double-check we got some contigs
        if os.path.exists(self.assembly_contigs):
            number_of_contigs = pyfastaq.tasks.count_sequences(self.assembly_contigs)
        else:
            number_of_contigs = 0

        self.assembled_ok = number_of_contigs != 0
        if not self.assembled_ok:
            return

        self._scaffold_with_sspace()
        self._gap_fill_with_gapfiller()

        pyfastaq.tasks.filter(
            self.gapfilled_scaffolds,
            self.gapfilled_length_filtered,
            minlength=self.min_scaff_length
        )
        if pyfastaq.tasks.count_sequences(self.gapfilled_length_filtered) == 0:
            self.assembled_ok = False
            return

        contigs_both_strands = self._fix_contig_orientation(
            self.gapfilled_length_filtered,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen
        )
        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            samtools=self.extern_progs.exe('samtools'),
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset=self.bowtie2_preset,
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert
        )
        print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

        if self.clean:
            for suffix in ['soft_clipped', 'unmapped_mates', 'scaff']:
                filename = self.final_assembly_bam + '.' + suffix
                print('Deleting file', filename, file=self.log_fh)
                os.unlink(filename)
    finally:
        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None
def run(self):
    '''Run the whole pipeline for this cluster.

    Picks the best gene, assembles the reads with the configured
    assembler, finishes the assembly, maps the reads back to it and
    compares the assembly with the gene. Report lines are made and
    temporary files cleaned whether or not the assembly worked.
    '''
    self.gene = self._choose_best_gene()

    if self.gene is not None:
        if self.assembler == 'velvet':
            self._assemble_with_velvet()
        elif self.assembler == 'spades':
            self._assemble_with_spades()

        # velvet can finish successfully, but make an empty contigs file
        if self.assembled_ok:
            if pyfastaq.tasks.count_sequences(self.assembly_contigs) == 0:
                self.assembled_ok = False
                self.status_flag.add('assembly_fail')
    else:
        self.assembled_ok = False

    if self.assembled_ok:
        # finish the assembly
        self._scaffold_with_sspace()
        self._gap_fill_with_gapfiller()
        self._fix_contig_orientation()
        self._load_final_contigs()

        # map reads to assembly
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=self.threads,
            sort=True,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=self.verbose,
        )
        self._parse_assembly_bam()

        # compare gene and assembly
        self._run_nucmer(
            self.final_assembly_fa,
            self.assembly_vs_gene_coords,
            show_snps=True
        )
        self._parse_assembly_vs_gene_coords()
        self._nucmer_hits_to_percent_identity()
        self._get_mummer_variants()
        self._filter_mummer_variants()
        self._update_flag_from_nucmer_file()
        self._make_assembly_vcf()
        self._get_vcf_variant_counts()

    self._make_report_lines()
    self._clean()
def test_run_bowtie2(self):
    """Test run_bowtie2 unsorted

    Calls mapping.run_bowtie2 with every optional argument left at its
    default and compares the SAM columns of the BAM it wrote with those
    of the expected unsorted BAM fixture.
    """
    self.maxDiff = None

    fwd_reads, rev_reads, ref_fa, expected_bam = [
        os.path.join(data_dir, basename)
        for basename in (
            "mapping_test_bowtie2_reads_1.fq",
            "mapping_test_bowtie2_reads_2.fq",
            "mapping_test_bowtie2_ref.fa",
            "mapping_test_bowtie2_unsorted.bam",
        )
    ]
    bam_prefix = "tmp.out.bowtie2"
    bam_written = bam_prefix + ".bam"

    mapping.run_bowtie2(fwd_reads, rev_reads, ref_fa, bam_prefix)

    expected = get_sam_columns(expected_bam)
    got = get_sam_columns(bam_written)
    self.assertListEqual(expected, got)
    os.unlink(bam_written)
def _map_reads_to_clustered_genes(self):
    '''Map both read files to the clustered genes FASTA with bowtie2.

    The output prefix is self.bam_prefix; thread count, executables,
    bowtie2 preset and verbosity all come from this object's settings.
    '''
    bowtie2_settings = dict(
        threads=self.threads,
        samtools=self.samtools_exe,
        bowtie2=self.bowtie2_exe,
        bowtie2_preset=self.bowtie2_preset,
        verbose=self.verbose,
    )
    mapping.run_bowtie2(
        self.reads_1,
        self.reads_2,
        self.db_fasta_clustered,
        self.bam_prefix,
        **bowtie2_settings
    )
def run(self):
    '''Run the whole pipeline for this cluster.

    Picks the best gene, assembles the reads with the configured
    assembler, finishes the assembly, maps the reads back to it,
    compares the assembly with the gene and writes the assembled gene
    sequences. Report lines are made and temporary files cleaned whether
    or not the assembly worked.
    '''
    self.gene = self._choose_best_gene()

    if self.gene is not None:
        if self.assembler == 'velvet':
            self._assemble_with_velvet()
        elif self.assembler == 'spades':
            self._assemble_with_spades()

        # velvet can finish successfully, but make an empty contigs file
        if self.assembled_ok:
            if pyfastaq.tasks.count_sequences(self.assembly_contigs) == 0:
                self.assembled_ok = False
                self.status_flag.add('assembly_fail')
    else:
        self.assembled_ok = False

    if self.assembled_ok:
        # finish the assembly
        self._scaffold_with_sspace()
        self._gap_fill_with_gapfiller()
        self._fix_contig_orientation()
        self._load_final_contigs()

        # map reads to assembly
        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=self.threads,
            sort=True,
            samtools=self.samtools_exe,
            bowtie2=self.bowtie2_exe,
            bowtie2_preset=self.bowtie2_preset,
            verbose=self.verbose,
        )
        self._parse_assembly_bam()

        # compare gene and assembly
        self._run_nucmer(
            self.final_assembly_fa,
            self.assembly_vs_gene_coords,
            show_snps=True
        )
        self._parse_assembly_vs_gene_coords()
        self._nucmer_hits_to_percent_identity()
        self._get_mummer_variants()
        self._filter_mummer_variants()
        self._update_flag_from_nucmer_file()
        self._make_assembly_vcf()
        self._get_vcf_variant_counts()
        self._nucmer_hits_to_assembled_gene_sequences(
            self.nucmer_hits,
            self.gene,
            self.final_assembly,
            self.final_assembled_genes_fa
        )

    self._make_report_lines()
    self._clean()
def test_run_bowtie2(self):
    '''Test run_bowtie2 unsorted

    Calls mapping.run_bowtie2 with only the four positional arguments
    and compares the SAM columns of the resulting BAM with those of the
    expected unsorted BAM fixture.
    '''
    self.maxDiff = None

    def in_data_dir(basename):
        return os.path.join(data_dir, basename)

    bam_prefix = 'tmp.out.bowtie2'
    bam_written = bam_prefix + '.bam'

    mapping.run_bowtie2(
        in_data_dir('mapping_test_bowtie2_reads_1.fq'),
        in_data_dir('mapping_test_bowtie2_reads_2.fq'),
        in_data_dir('mapping_test_bowtie2_ref.fa'),
        bam_prefix
    )

    expected = get_sam_columns(in_data_dir('mapping_test_bowtie2_unsorted.bam'))
    got = get_sam_columns(bam_written)
    self.assertListEqual(expected, got)
    os.unlink(bam_written)
def _map_reads_to_clustered_genes(self):
    '''Map both read files to the cd-hit cluster representatives.

    Runs bowtie2 through mapping.run_bowtie2 with self.bam_prefix as the
    output prefix, passing remove_both_unmapped=True. The executables
    are looked up in self.extern_progs.
    '''
    bowtie2_settings = dict(
        threads=self.threads,
        samtools=self.extern_progs.exe('samtools'),
        bowtie2=self.extern_progs.exe('bowtie2'),
        bowtie2_preset=self.bowtie2_preset,
        verbose=self.verbose,
        remove_both_unmapped=True,
    )
    mapping.run_bowtie2(
        self.reads_1,
        self.reads_2,
        self.cdhit_cluster_representatives_fa,
        self.bam_prefix,
        **bowtie2_settings
    )
def test_run_bowtie2(self):
    '''Test run_bowtie2 unsorted

    Passes the samtools and bowtie2 executables found by extern_progs
    to mapping.run_bowtie2 and compares the SAM columns of the
    resulting BAM with those of the expected unsorted BAM fixture.
    '''
    self.maxDiff = None

    def in_data_dir(basename):
        return os.path.join(data_dir, basename)

    bam_prefix = 'tmp.out.bowtie2'
    bam_written = bam_prefix + '.bam'

    mapping.run_bowtie2(
        in_data_dir('mapping_test_bowtie2_reads_1.fq'),
        in_data_dir('mapping_test_bowtie2_reads_2.fq'),
        in_data_dir('mapping_test_bowtie2_ref.fa'),
        bam_prefix,
        samtools=extern_progs.exe('samtools'),
        bowtie2=extern_progs.exe('bowtie2'),
    )

    expected = get_sam_columns(in_data_dir('mapping_test_bowtie2_unsorted.bam'))
    got = get_sam_columns(bam_written)
    self.assertListEqual(expected, got)
    os.unlink(bam_written)
def test_run_bowtie2_and_sort(self):
    '''Test run_bowtie2 sorted

    Runs mapping.run_bowtie2 with sort=True, compares the SAM columns of
    the BAM it wrote with those of the expected sorted BAM fixture, then
    deletes the BAM and its '.bai' index.
    '''
    def in_data_dir(basename):
        return os.path.join(data_dir, basename)

    bam_prefix = 'tmp.out.bowtie2'
    bam_written = bam_prefix + '.bam'

    mapping.run_bowtie2(
        in_data_dir('mapping_test_bowtie2_reads_1.fq'),
        in_data_dir('mapping_test_bowtie2_reads_2.fq'),
        in_data_dir('mapping_test_bowtie2_ref.fa'),
        bam_prefix,
        sort=True,
        bowtie2=extern_progs.exe('bowtie2'),
    )

    expected = get_sam_columns(in_data_dir('mapping_test_bowtie2_sorted.bam'))
    got = get_sam_columns(bam_written)
    self.assertListEqual(expected, got)

    for written in (bam_written, bam_written + '.bai'):
        os.unlink(written)
def test_run_bowtie2_remove_both_unmapped(self):
    '''Test run_bowtie2 unsorted remove both unmapped

    Runs mapping.run_bowtie2 with remove_both_unmapped=True on the
    dedicated fixture reads and compares the SAM columns of the BAM it
    wrote with those of the expected BAM fixture.
    '''
    self.maxDiff = None

    reads_stem = 'mapping_test_bowtie2_remove_both_unmapped_reads'
    fwd_reads, rev_reads, expected_bam, ref_fa = [
        os.path.join(data_dir, basename)
        for basename in (
            reads_stem + '_1.fq',
            reads_stem + '_2.fq',
            reads_stem + '.bam',
            'mapping_test_bowtie2_ref.fa',
        )
    ]
    bam_prefix = 'tmp.out.bowtie2_remove_both_unmapped'
    bam_written = bam_prefix + '.bam'

    mapping.run_bowtie2(
        fwd_reads,
        rev_reads,
        ref_fa,
        bam_prefix,
        bowtie2=extern_progs.exe('bowtie2'),
        bowtie2_version=extern_progs.version('bowtie2'),
        remove_both_unmapped=True,
    )

    expected = get_sam_columns(expected_bam)
    got = get_sam_columns(bam_written)
    self.assertListEqual(expected, got)
    os.unlink(bam_written)
def test_run_bowtie2_and_sort(self):
    '''Test run_bowtie2 sorted

    Runs mapping.run_bowtie2 with sort=True and the bowtie2 executable
    and version reported by extern_progs, compares the SAM columns of
    the BAM it wrote with those of the expected sorted BAM fixture, then
    deletes the BAM and its '.bai' index.
    '''
    fwd_reads, rev_reads, ref_fa, expected_bam = [
        os.path.join(data_dir, basename)
        for basename in (
            'mapping_test_bowtie2_reads_1.fq',
            'mapping_test_bowtie2_reads_2.fq',
            'mapping_test_bowtie2_ref.fa',
            'mapping_test_bowtie2_sorted.bam',
        )
    ]
    bam_prefix = 'tmp.out.bowtie2'
    bam_written = bam_prefix + '.bam'

    mapping.run_bowtie2(
        fwd_reads,
        rev_reads,
        ref_fa,
        bam_prefix,
        sort=True,
        bowtie2=extern_progs.exe('bowtie2'),
        bowtie2_version=extern_progs.version('bowtie2'),
    )

    expected = get_sam_columns(expected_bam)
    got = get_sam_columns(bam_written)
    self.assertListEqual(expected, got)

    for written in (bam_written, bam_written + '.bai'):
        os.unlink(written)
def _run(self):
    '''Assemble this cluster's reads, analyse the assembly, make the report.

    Every stage is logged to self.log_fh. In order:
      1. If there are reads, a subset is made and assembled with
         assembly.Assembly; self.assembled_ok is taken from it.
      2. If the assembly worked and a closest reference was found, all
         reads are mapped to the final assembly, the assembly is
         compared with the reference, and variants are collected from
         the comparison and from samtools. self.status_flag is updated
         along the way.
      3. Otherwise 'assembly_fail' or 'ref_seq_choose_fail' is added to
         self.status_flag.
      4. self.report_lines is built, temporary files are cleaned and the
         atexit handler is unregistered.

    Raises Error if the report cannot be made; the original exception is
    attached as the cause. KeyboardInterrupt and SystemExit are not
    converted into Error.
    '''
    print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)

    if self.total_reads == 0:
        print('No reads left after filtering with cdhit', file=self.log_fh, flush=True)
        self.assembled_ok = False
    else:
        wanted_reads = self._number_of_reads_for_assembly(
            self.longest_ref_length,
            self.reads_insert,
            self.total_reads_bases,
            self.total_reads,
            self.assembly_coverage
        )
        made_reads = self._make_reads_for_assembly(
            wanted_reads,
            self.total_reads,
            self.all_reads1,
            self.all_reads2,
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            random_seed=self.random_seed
        )
        print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
        print('Assembling reads:', file=self.log_fh, flush=True)

        self.assembly = assembly.Assembly(
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            self.reference_fa,
            self.references_fa,
            self.assembly_dir,
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.log_fh,
            self.all_refs_fasta,
            contig_name_prefix=self.name,
            assembler=self.assembler,
            extern_progs=self.extern_progs,
            clean=self.clean
        )

        self.assembly.run()
        self.assembled_ok = self.assembly.assembled_ok
        self._clean_file(self.reads_for_assembly1)
        self._clean_file(self.reads_for_assembly2)
        if self.clean:
            print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
            shutil.rmtree(self.assembly_dir)

    if self.assembled_ok and self.assembly.ref_seq_name is not None:
        self.ref_sequence = self.refdata.sequence(self.assembly.ref_seq_name)
        is_gene, is_variant_only = self.refdata.sequence_type(self.ref_sequence.id)
        self.is_gene = '1' if is_gene == 'p' else '0'
        self.is_variant_only = '1' if is_variant_only else '0'

        print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)

        mapping.run_bowtie2(
            self.all_reads1,
            self.all_reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset='very-sensitive-local',
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        if self.assembly.has_contigs_on_both_strands:
            self.status_flag.add('hit_both_strands')

        print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)

        if not self.assembly.scaff_graph_ok:
            self.status_flag.add('scaffold_graph_bad')

        print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
        self.assembly_compare = assembly_compare.AssemblyCompare(
            self.final_assembly_fa,
            self.assembly.sequences,
            self.reference_fa,
            self.ref_sequence,
            self.assembly_compare_prefix,
            self.refdata,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
            assembled_threshold=self.assembled_threshold,
            unique_threshold=self.unique_threshold,
            max_gene_nt_extend=self.max_gene_nt_extend,
        )
        self.assembly_compare.run()
        self.status_flag = self.assembly_compare.update_flag(self.status_flag)

        allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(self.assembly_compare.nucmer_hits)
        assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file)
        self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos)

        # One variant whose 4th field is not '.', 'SYN' or None is enough
        # to set the flag, so stop scanning at the first one.
        for var_list in self.assembly_variants.values():
            for var in var_list:
                if var[3] not in ['.', 'SYN', None]:
                    self.status_flag.add('has_variant')
                    break

            if self.status_flag.has('has_variant'):
                break

        print('\nCalling variants with samtools:', file=self.log_fh, flush=True)

        self.samtools_vars = samtools_variants.SamtoolsVariants(
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.samtools_vars_prefix,
            log_fh=self.log_fh,
            min_var_read_depth=self.min_var_read_depth,
            min_second_var_read_depth=self.min_second_var_read_depth,
            max_allele_freq=self.max_allele_freq
        )
        self.samtools_vars.run()

        self.total_contig_depths = self.samtools_vars.total_depth_per_contig(self.samtools_vars.contig_depths_file)

        self.variants_from_samtools = self.samtools_vars.variants_in_coords(self.assembly_compare.assembly_match_coords(), self.samtools_vars.vcf_file)
        if len(self.variants_from_samtools):
            self.status_flag.add('variants_suggest_collapsed_repeat')
    elif not self.assembled_ok:
        print('\nAssembly failed\n', file=self.log_fh, flush=True)
        self.status_flag.add('assembly_fail')
    elif self.assembly.ref_seq_name is None:
        print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
        self.status_flag.add('ref_seq_choose_fail')

    try:
        self.report_lines = report.report_lines(self)
    except Exception as err:
        # Not a bare except: Ctrl-C and interpreter exit must pass
        # through untouched instead of being reported as a report error.
        print('Error making report for cluster ', self.name, '... traceback:', file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        raise Error('Error making report for cluster ' + self.name) from err

    self._clean()
    atexit.unregister(self._atexit)
def _run(self):
    '''Choose a reference, assemble the reads, analyse, and make the report.

    Every stage is logged to self.log_fh. A best reference sequence is
    chosen first; if there is one, a subset of the reads is assembled
    with assembly.Assembly. When the assembly worked, all reads are
    mapped to it, it is compared with the reference, and variants are
    collected from that comparison and from samtools, updating
    self.status_flag as it goes. Finally self.report_lines is made,
    temporary files are cleaned and the atexit handler is unregistered.
    '''
    print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)
    print('Choosing best reference sequence:', file=self.log_fh, flush=True)

    chooser = best_seq_chooser.BestSeqChooser(
        self.all_reads1,
        self.all_reads2,
        self.references_fa,
        self.log_fh,
        samtools_exe=self.extern_progs.exe('samtools'),
        bowtie2_exe=self.extern_progs.exe('bowtie2'),
        bowtie2_preset=self.bowtie2_preset,
        threads=1,
    )
    self.ref_sequence = chooser.best_seq(self.reference_fa)
    self._clean_file(self.references_fa)
    self._clean_file(self.references_fa + '.fai')

    if self.ref_sequence is not None:
        n_wanted = self._number_of_reads_for_assembly(
            self.reference_fa,
            self.reads_insert,
            self.total_reads_bases,
            self.total_reads,
            self.assembly_coverage
        )
        n_made = self._make_reads_for_assembly(
            n_wanted,
            self.total_reads,
            self.all_reads1,
            self.all_reads2,
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            random_seed=self.random_seed
        )
        print('\nUsing', n_made, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
        print('Assembling reads:', file=self.log_fh, flush=True)

        self.ref_sequence_type = self.refdata.sequence_type(self.ref_sequence.id)
        assert self.ref_sequence_type is not None

        self.assembly = assembly.Assembly(
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            self.reference_fa,
            self.assembly_dir,
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.log_fh,
            scaff_name_prefix=self.ref_sequence.id,
            kmer=self.assembly_kmer,
            assembler=self.assembler,
            spades_other_options=self.spades_other_options,
            sspace_k=self.sspace_k,
            sspace_sd=self.sspace_sd,
            reads_insert=self.reads_insert,
            extern_progs=self.extern_progs,
            clean=self.clean
        )
        self.assembly.run()
        self.assembled_ok = self.assembly.assembled_ok

        for reads_file in (self.reads_for_assembly1, self.reads_for_assembly2):
            self._clean_file(reads_file)

        if self.clean:
            print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
            shutil.rmtree(self.assembly_dir)
    else:
        self.status_flag.add('ref_seq_choose_fail')
        self.assembled_ok = False

    if not self.assembled_ok:
        print('\nAssembly failed\n', file=self.log_fh, flush=True)
        self.status_flag.add('assembly_fail')
    else:
        print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)

        mapping.run_bowtie2(
            self.all_reads1,
            self.all_reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            samtools=self.extern_progs.exe('samtools'),
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset=self.bowtie2_preset,
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        if self.assembly.has_contigs_on_both_strands:
            self.status_flag.add('hit_both_strands')

        print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)

        if not self.assembly.scaff_graph_ok:
            self.status_flag.add('scaffold_graph_bad')

        print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
        self.assembly_compare = assembly_compare.AssemblyCompare(
            self.final_assembly_fa,
            self.assembly.sequences,
            self.reference_fa,
            self.ref_sequence,
            self.assembly_compare_prefix,
            self.refdata,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
            assembled_threshold=self.assembled_threshold,
            unique_threshold=self.unique_threshold,
            max_gene_nt_extend=self.max_gene_nt_extend,
        )
        self.assembly_compare.run()
        self.status_flag = self.assembly_compare.update_flag(self.status_flag)

        ref_coords = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(
            self.assembly_compare.nucmer_hits)
        variant_finder = assembly_variants.AssemblyVariants(
            self.refdata, self.assembly_compare.nucmer_snps_file)
        self.assembly_variants = variant_finder.get_variants(
            self.ref_sequence.id, ref_coords)

        # Stop scanning as soon as the flag has been set.
        for variants in self.assembly_variants.values():
            for variant in variants:
                if variant[3] not in ['.', 'SYN', None]:
                    self.status_flag.add('has_nonsynonymous_variants')
                    break

            if self.status_flag.has('has_nonsynonymous_variants'):
                break

        print('\nCalling variants with samtools:', file=self.log_fh, flush=True)

        self.samtools_vars = samtools_variants.SamtoolsVariants(
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.samtools_vars_prefix,
            log_fh=self.log_fh,
            samtools_exe=self.extern_progs.exe('samtools'),
            bcftools_exe=self.extern_progs.exe('bcftools'),
            bcf_min_dp=self.bcf_min_dp,
            bcf_min_dv=self.bcf_min_dv,
            bcf_min_dv_over_dp=self.bcf_min_dv_over_dp,
            bcf_min_qual=self.bcf_min_qual,
        )
        self.samtools_vars.run()

        self.total_contig_depths = self.samtools_vars.total_depth_per_contig(
            self.samtools_vars.read_depths_file)

        if self.samtools_vars.variants_in_coords(
                self.assembly_compare.assembly_match_coords(),
                self.samtools_vars.vcf_file):
            self.status_flag.add('variants_suggest_collapsed_repeat')

    print('\nMaking report lines', file=self.log_fh, flush=True)
    self.report_lines = report.report_lines(self)
    self._clean()
    atexit.unregister(self._atexit)
def _run(self):
    '''Assemble this cluster's reads, analyse the assembly, make the report.

    Every stage is logged to self.log_fh. In order:
      1. If there are reads, a subset is made and assembled with
         assembly.Assembly; self.assembled_ok is taken from it.
         self._update_threads() is called before the assembly and again
         before the reads are mapped.
      2. If the assembly worked and a closest reference was found, all
         reads are mapped to the final assembly, the assembly is
         compared with the reference, and variants are collected from
         the comparison and from samtools. self.status_flag is updated
         along the way.
      3. Otherwise 'assembly_fail' or 'ref_seq_choose_fail' is added to
         self.status_flag.
      4. self.report_lines is built, temporary files are cleaned and the
         atexit handler is unregistered.

    Raises Error if the report cannot be made; the original exception is
    attached as the cause. KeyboardInterrupt and SystemExit are not
    converted into Error.
    '''
    print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)

    if self.total_reads == 0:
        print('No reads left after filtering with cdhit', file=self.log_fh, flush=True)
        self.assembled_ok = False
    else:
        wanted_reads = self._number_of_reads_for_assembly(
            self.longest_ref_length,
            self.reads_insert,
            self.total_reads_bases,
            self.total_reads,
            self.assembly_coverage)
        made_reads = self._make_reads_for_assembly(
            wanted_reads,
            self.total_reads,
            self.all_reads1,
            self.all_reads2,
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            random_seed=self.random_seed)
        print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
        print('Assembling reads:', file=self.log_fh, flush=True)

        self._update_threads()
        self.assembly = assembly.Assembly(
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            self.reference_fa,
            self.references_fa,
            self.assembly_dir,
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.log_fh,
            self.all_refs_fasta,
            contig_name_prefix=self.name,
            assembler=self.assembler,
            extern_progs=self.extern_progs,
            clean=self.clean,
            spades_mode=self.spades_mode,
            spades_options=self.spades_options,
            threads=self.threads)

        self.assembly.run()
        self.assembled_ok = self.assembly.assembled_ok
        self._clean_file(self.reads_for_assembly1)
        self._clean_file(self.reads_for_assembly2)
        if self.clean:
            print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
            shutil.rmtree(self.assembly_dir, ignore_errors=True)

    if self.assembled_ok and self.assembly.ref_seq_name is not None:
        self.ref_sequence = self.refdata.sequence(
            self.assembly.ref_seq_name)
        is_gene, is_variant_only = self.refdata.sequence_type(
            self.ref_sequence.id)
        self.is_gene = '1' if is_gene == 'p' else '0'
        self.is_variant_only = '1' if is_variant_only else '0'

        print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)

        self._update_threads()
        mapping.run_bowtie2(
            self.all_reads1,
            self.all_reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=self.threads,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset='very-sensitive-local',
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh)

        if self.assembly.has_contigs_on_both_strands:
            self.status_flag.add('hit_both_strands')

        print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)

        if not self.assembly.scaff_graph_ok:
            self.status_flag.add('scaffold_graph_bad')

        print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
        self.assembly_compare = assembly_compare.AssemblyCompare(
            self.final_assembly_fa,
            self.assembly.sequences,
            self.reference_fa,
            self.ref_sequence,
            self.assembly_compare_prefix,
            self.refdata,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
            assembled_threshold=self.assembled_threshold,
            unique_threshold=self.unique_threshold,
            max_gene_nt_extend=self.max_gene_nt_extend,
        )
        self.assembly_compare.run()
        self.status_flag = self.assembly_compare.update_flag(
            self.status_flag)

        allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(
            self.assembly_compare.nucmer_hits)
        assembly_variants_obj = assembly_variants.AssemblyVariants(
            self.refdata, self.assembly_compare.nucmer_snps_file)
        self.assembly_variants = assembly_variants_obj.get_variants(
            self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos)

        # One variant whose 4th field is not '.', 'SYN' or None is enough
        # to set the flag, so stop scanning at the first one.
        for var_list in self.assembly_variants.values():
            for var in var_list:
                if var[3] not in ['.', 'SYN', None]:
                    self.status_flag.add('has_variant')
                    break

            if self.status_flag.has('has_variant'):
                break

        print('\nCalling variants with samtools:', file=self.log_fh, flush=True)

        self.samtools_vars = samtools_variants.SamtoolsVariants(
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.samtools_vars_prefix,
            log_fh=self.log_fh,
            min_var_read_depth=self.min_var_read_depth,
            min_second_var_read_depth=self.min_second_var_read_depth,
            max_allele_freq=self.max_allele_freq)
        self.samtools_vars.run()

        self.total_contig_depths = self.samtools_vars.total_depth_per_contig(
            self.samtools_vars.contig_depths_file)

        self.variants_from_samtools = self.samtools_vars.variants_in_coords(
            self.assembly_compare.assembly_match_coords(),
            self.samtools_vars.vcf_file)
        if len(self.variants_from_samtools):
            self.status_flag.add('variants_suggest_collapsed_repeat')
    elif not self.assembled_ok:
        print('\nAssembly failed\n', file=self.log_fh, flush=True)
        self.status_flag.add('assembly_fail')
    elif self.assembly.ref_seq_name is None:
        print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
        self.status_flag.add('ref_seq_choose_fail')

    try:
        self.report_lines = report.report_lines(self)
    except Exception as err:
        # Not a bare except: Ctrl-C and interpreter exit must pass
        # through untouched instead of being reported as a report error.
        print('Error making report for cluster ', self.name, '... traceback:', file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        raise Error('Error making report for cluster ' + self.name) from err

    self._clean()
    atexit.unregister(self._atexit)
def run(self):
    '''Assemble the reads, pick the closest reference, orient and map.

    Runs the assembler named by self.assembler ('fermilite' or
    "spades"). Sets self.assembled_ok to False and stops when no contigs
    were produced. Otherwise a RefSeqChooser picks the closest
    reference; if none matches, or the closest one is not in this
    cluster, self.ref_seq_name is set to None and the method stops.
    When a reference is chosen, it is written to self.ref_fasta, contigs
    are oriented against it, self.sequences is filled from the final
    assembly, the reads are mapped to it, and self.scaff_graph_ok is
    set. If self.clean is true the three side files written next to the
    final BAM are deleted.

    self.log_fh is set to None on every exit path, including when an
    exception is raised.
    '''
    try:
        if self.assembler == 'fermilite':
            self._assemble_with_fermilite()
        elif self.assembler == "spades":
            self._assemble_with_spades()
        print('Finished running assemblies', flush=True, file=self.log_fh)
        self.sequences = {}

        # double-check we got some contigs
        if os.path.exists(self.all_assembly_contigs_fa):
            number_of_contigs = pyfastaq.tasks.count_sequences(
                self.all_assembly_contigs_fa)
        else:
            number_of_contigs = 0

        self.assembled_ok = number_of_contigs != 0
        if not self.assembled_ok:
            return

        ref_chooser = ref_seq_chooser.RefSeqChooser(
            self.ref_fastas,
            self.all_reference_fasta,
            self.all_assembly_contigs_fa,
            self.best_assembly_fa,
            self.log_fh,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
        )
        ref_chooser.run()

        closest_ref = ref_chooser.closest_ref_from_all_refs
        if closest_ref is None:
            print('Could not find match to reference sequences', file=self.log_fh)
            self.ref_seq_name = None
            return

        if not ref_chooser.closest_ref_is_in_cluster:
            print('Closest reference', closest_ref, 'was not in cluster', file=self.log_fh)
            self.ref_seq_name = None
            return

        self.ref_seq_name = closest_ref
        print('Closest reference sequence:', self.ref_seq_name, file=self.log_fh)

        # Copy the chosen reference into a FASTA file of its own.
        file_reader = pyfastaq.sequences.file_reader(self.ref_fastas)
        for ref_seq in file_reader:
            if self.ref_seq_name == ref_seq.id:
                f_out = pyfastaq.utils.open_file_write(self.ref_fasta)
                try:
                    print(ref_seq, file=f_out)
                finally:
                    pyfastaq.utils.close(f_out)
                break

        contigs_both_strands = self._fix_contig_orientation(
            self.best_assembly_fa,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen)
        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=self.threads,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh)

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert)
        print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

        if self.clean:
            for suffix in ['soft_clipped', 'unmapped_mates', 'scaff']:
                filename = self.final_assembly_bam + '.' + suffix
                print('Deleting file', filename, file=self.log_fh)
                os.unlink(filename)
    finally:
        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None
def run(self):
    '''Assemble with fermilite, pick the closest reference, orient and map.

    Sets self.assembled_ok to False and stops when no contigs were
    produced. Otherwise a RefSeqChooser picks the closest reference; if
    none matches, or the closest one is not in this cluster,
    self.ref_seq_name is set to None and the method stops. When a
    reference is chosen, it is written to self.ref_fasta, contigs are
    oriented against it, self.sequences is filled from the final
    assembly, the reads are mapped to it, and self.scaff_graph_ok is
    set. If self.clean is true the three side files written next to the
    final BAM are deleted.

    self.log_fh is set to None on every exit path, including when an
    exception is raised.
    '''
    try:
        self._assemble_with_fermilite()
        print('Finished running assemblies', flush=True, file=self.log_fh)
        self.sequences = {}

        # double-check we got some contigs
        if os.path.exists(self.all_assembly_contigs_fa):
            number_of_contigs = pyfastaq.tasks.count_sequences(self.all_assembly_contigs_fa)
        else:
            number_of_contigs = 0

        self.assembled_ok = number_of_contigs != 0
        if not self.assembled_ok:
            return

        ref_chooser = ref_seq_chooser.RefSeqChooser(
            self.ref_fastas,
            self.all_reference_fasta,
            self.all_assembly_contigs_fa,
            self.best_assembly_fa,
            self.log_fh,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
        )
        ref_chooser.run()

        closest_ref = ref_chooser.closest_ref_from_all_refs
        if closest_ref is None:
            print('Could not find match to reference sequences', file=self.log_fh)
            self.ref_seq_name = None
            return

        if not ref_chooser.closest_ref_is_in_cluster:
            print('Closest reference', closest_ref, 'was not in cluster', file=self.log_fh)
            self.ref_seq_name = None
            return

        self.ref_seq_name = closest_ref
        print('Closest reference sequence:', self.ref_seq_name, file=self.log_fh)

        # Copy the chosen reference into a FASTA file of its own.
        file_reader = pyfastaq.sequences.file_reader(self.ref_fastas)
        for ref_seq in file_reader:
            if self.ref_seq_name == ref_seq.id:
                f_out = pyfastaq.utils.open_file_write(self.ref_fasta)
                try:
                    print(ref_seq, file=f_out)
                finally:
                    pyfastaq.utils.close(f_out)
                break

        contigs_both_strands = self._fix_contig_orientation(
            self.best_assembly_fa,
            self.ref_fasta,
            self.final_assembly_fa,
            min_id=self.nucmer_min_id,
            min_length=self.nucmer_min_len,
            breaklen=self.nucmer_breaklen
        )
        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)

        mapping.run_bowtie2(
            self.reads1,
            self.reads2,
            self.final_assembly_fa,
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh
        )

        self.scaff_graph_ok = self._parse_bam(
            self.sequences,
            self.final_assembly_bam,
            self.min_scaff_depth,
            self.max_insert
        )
        print('Scaffolding graph is OK:', self.scaff_graph_ok, file=self.log_fh)

        if self.clean:
            for suffix in ['soft_clipped', 'unmapped_mates', 'scaff']:
                filename = self.final_assembly_bam + '.' + suffix
                print('Deleting file', filename, file=self.log_fh)
                os.unlink(filename)
    finally:
        # This is to make this object picklable, to keep multithreading happy
        self.log_fh = None