def _run(self):
    """Run the whole pipeline for this cluster and store its report lines.

    The steps, in order, are:

    1. Pick a reference sequence with ``best_seq_chooser.BestSeqChooser``
       and store it in ``self.ref_sequence``.
    2. If a reference was picked, prepare the reads for assembly and run
       ``assembly.Assembly``; ``self.assembled_ok`` mirrors its result.
    3. If the assembly succeeded, map all reads back to it with bowtie2,
       compare it against the reference, collect assembly variants and
       run the samtools-based variant calling.
    4. Build ``self.report_lines``, call ``self._clean()`` and unregister
       the ``self._atexit`` hook.

    Flags that may be added to ``self.status_flag`` here:
    ``ref_seq_choose_fail``, ``assembly_fail``, ``hit_both_strands``,
    ``scaffold_graph_bad``, ``has_nonsynonymous_variants`` and
    ``variants_suggest_collapsed_repeat`` (plus whatever
    ``assembly_compare.update_flag`` sets).

    Raises:
        AssertionError: if ``self.refdata.sequence_type`` returns ``None``
            for the chosen reference.
    """

    def log(*parts):
        # All progress messages go to the cluster log and are flushed
        # immediately after being written.
        print(*parts, file=self.log_fh, flush=True)

    log('{:_^79}'.format(' LOG FILE START ' + self.name + ' '))

    # ---- choose the reference sequence --------------------------------
    log('Choosing best reference sequence:')
    chooser = best_seq_chooser.BestSeqChooser(
        self.all_reads1,
        self.all_reads2,
        self.references_fa,
        self.log_fh,
        samtools_exe=self.extern_progs.exe('samtools'),
        bowtie2_exe=self.extern_progs.exe('bowtie2'),
        bowtie2_preset=self.bowtie2_preset,
        threads=1,
    )
    self.ref_sequence = chooser.best_seq(self.reference_fa)

    # The multi-reference FASTA and its index are handed to _clean_file
    # as soon as the chooser has finished with them.
    self._clean_file(self.references_fa)
    self._clean_file(self.references_fa + '.fai')

    # ---- assemble -----------------------------------------------------
    if self.ref_sequence is None:
        self.status_flag.add('ref_seq_choose_fail')
        self.assembled_ok = False
    else:
        n_wanted = self._number_of_reads_for_assembly(
            self.reference_fa,
            self.reads_insert,
            self.total_reads_bases,
            self.total_reads,
            self.assembly_coverage,
        )
        n_made = self._make_reads_for_assembly(
            n_wanted,
            self.total_reads,
            self.all_reads1,
            self.all_reads2,
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            random_seed=self.random_seed,
        )
        log('\nUsing', n_made, 'from a total of', self.total_reads, 'for assembly.')
        log('Assembling reads:')

        self.ref_sequence_type = self.refdata.sequence_type(self.ref_sequence.id)
        assert self.ref_sequence_type is not None

        self.assembly = assembly.Assembly(
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            self.reference_fa,
            self.assembly_dir,
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.log_fh,
            scaff_name_prefix=self.ref_sequence.id,
            kmer=self.assembly_kmer,
            assembler=self.assembler,
            spades_other_options=self.spades_other_options,
            sspace_k=self.sspace_k,
            sspace_sd=self.sspace_sd,
            reads_insert=self.reads_insert,
            extern_progs=self.extern_progs,
            clean=self.clean,
        )
        self.assembly.run()
        self.assembled_ok = self.assembly.assembled_ok

        self._clean_file(self.reads_for_assembly1)
        self._clean_file(self.reads_for_assembly2)
        if self.clean:
            log('Deleting Assembly directory', self.assembly_dir)
            shutil.rmtree(self.assembly_dir)

    # ---- analyse the assembly ----------------------------------------
    if not self.assembled_ok:
        # Also reached when no reference could be chosen above.
        log('\nAssembly failed\n')
        self.status_flag.add('assembly_fail')
    else:
        log('\nAssembly was successful\n\nMapping reads to assembly:')
        mapping.run_bowtie2(
            self.all_reads1,
            self.all_reads2,
            self.final_assembly_fa,
            # Output prefix: the BAM path minus its last four characters
            # (presumably the '.bam' extension -- confirm).
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            samtools=self.extern_progs.exe('samtools'),
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset=self.bowtie2_preset,
            verbose=True,
            verbose_filehandle=self.log_fh,
        )

        if self.assembly.has_contigs_on_both_strands:
            self.status_flag.add('hit_both_strands')

        log('\nMaking and checking scaffold graph')
        if not self.assembly.scaff_graph_ok:
            self.status_flag.add('scaffold_graph_bad')

        log('Comparing assembly against reference sequence')
        self.assembly_compare = assembly_compare.AssemblyCompare(
            self.final_assembly_fa,
            self.assembly.sequences,
            self.reference_fa,
            self.ref_sequence,
            self.assembly_compare_prefix,
            self.refdata,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
            assembled_threshold=self.assembled_threshold,
            unique_threshold=self.unique_threshold,
            max_gene_nt_extend=self.max_gene_nt_extend,
        )
        self.assembly_compare.run()
        self.status_flag = self.assembly_compare.update_flag(self.status_flag)

        ref_hit_coords = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(
            self.assembly_compare.nucmer_hits
        )
        variant_finder = assembly_variants.AssemblyVariants(
            self.refdata, self.assembly_compare.nucmer_snps_file
        )
        self.assembly_variants = variant_finder.get_variants(
            self.ref_sequence.id, ref_hit_coords
        )

        # Stop scanning as soon as the flag is set: one qualifying variant
        # (fourth field not '.', 'SYN' or None) is enough.
        for variants in self.assembly_variants.values():
            for variant in variants:
                if variant[3] not in ('.', 'SYN', None):
                    self.status_flag.add('has_nonsynonymous_variants')
                    break
            if self.status_flag.has('has_nonsynonymous_variants'):
                break

        log('\nCalling variants with samtools:')
        self.samtools_vars = samtools_variants.SamtoolsVariants(
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.samtools_vars_prefix,
            log_fh=self.log_fh,
            samtools_exe=self.extern_progs.exe('samtools'),
            bcftools_exe=self.extern_progs.exe('bcftools'),
            bcf_min_dp=self.bcf_min_dp,
            bcf_min_dv=self.bcf_min_dv,
            bcf_min_dv_over_dp=self.bcf_min_dv_over_dp,
            bcf_min_qual=self.bcf_min_qual,
        )
        self.samtools_vars.run()
        self.total_contig_depths = self.samtools_vars.total_depth_per_contig(
            self.samtools_vars.read_depths_file
        )

        match_coords = self.assembly_compare.assembly_match_coords()
        if self.samtools_vars.variants_in_coords(match_coords, self.samtools_vars.vcf_file):
            self.status_flag.add('variants_suggest_collapsed_repeat')

    # ---- report and tidy up ------------------------------------------
    log('\nMaking report lines')
    self.report_lines = report.report_lines(self)
    self._clean()
    atexit.unregister(self._atexit)
def _run(self):
    """Run the whole pipeline for this cluster and store its report lines.

    The steps, in order, are:

    1. If ``self.total_reads`` is zero, skip assembly and mark
       ``self.assembled_ok`` as ``False``. Otherwise prepare the reads
       for assembly and run ``assembly.Assembly``; ``self.assembled_ok``
       mirrors its result.
    2. If the assembly succeeded and it reports a reference name
       (``self.assembly.ref_seq_name``), look that reference up in
       ``self.refdata``, map all reads back to the assembly with bowtie2,
       compare the assembly against the reference, collect assembly
       variants and run the samtools-based variant calling.
    3. Build ``self.report_lines``, call ``self._clean()`` and unregister
       the ``self._atexit`` hook.

    Flags that may be added to ``self.status_flag`` here:
    ``assembly_fail``, ``ref_seq_choose_fail``, ``hit_both_strands``,
    ``scaffold_graph_bad``, ``has_variant`` and
    ``variants_suggest_collapsed_repeat`` (plus whatever
    ``assembly_compare.update_flag`` sets).

    Raises:
        Error: if ``report.report_lines`` raises an ordinary exception.
            The traceback is printed to stderr first and the original
            exception is attached as ``__cause__``. ``KeyboardInterrupt``
            and ``SystemExit`` are deliberately *not* converted, so an
            interrupt is never disguised as a reporting failure.
    """
    print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)

    # ---- assemble -----------------------------------------------------
    if self.total_reads == 0:
        print('No reads left after filtering with cdhit', file=self.log_fh, flush=True)
        self.assembled_ok = False
    else:
        wanted_reads = self._number_of_reads_for_assembly(
            self.longest_ref_length,
            self.reads_insert,
            self.total_reads_bases,
            self.total_reads,
            self.assembly_coverage,
        )
        made_reads = self._make_reads_for_assembly(
            wanted_reads,
            self.total_reads,
            self.all_reads1,
            self.all_reads2,
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            random_seed=self.random_seed,
        )
        print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
        print('Assembling reads:', file=self.log_fh, flush=True)

        # Refresh the thread allocation right before self.threads is
        # passed on to the assembler.
        self._update_threads()
        self.assembly = assembly.Assembly(
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            self.reference_fa,
            self.references_fa,
            self.assembly_dir,
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.log_fh,
            self.all_refs_fasta,
            contig_name_prefix=self.name,
            assembler=self.assembler,
            extern_progs=self.extern_progs,
            clean=self.clean,
            spades_mode=self.spades_mode,
            spades_options=self.spades_options,
            threads=self.threads,
        )
        self.assembly.run()
        self.assembled_ok = self.assembly.assembled_ok

        self._clean_file(self.reads_for_assembly1)
        self._clean_file(self.reads_for_assembly2)
        if self.clean:
            print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
            shutil.rmtree(self.assembly_dir, ignore_errors=True)

    # ---- analyse the assembly ----------------------------------------
    # NOTE: the first test short-circuits on assembled_ok, so
    # self.assembly is not touched when the assembly step was skipped.
    if self.assembled_ok and self.assembly.ref_seq_name is not None:
        self.ref_sequence = self.refdata.sequence(self.assembly.ref_seq_name)
        is_gene, is_variant_only = self.refdata.sequence_type(self.ref_sequence.id)
        # Both attributes are stored as the strings '1' / '0'.
        self.is_gene = '1' if is_gene == 'p' else '0'
        self.is_variant_only = '1' if is_variant_only else '0'

        print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)
        self._update_threads()
        mapping.run_bowtie2(
            self.all_reads1,
            self.all_reads2,
            self.final_assembly_fa,
            # Output prefix: the BAM path minus its last four characters
            # (presumably the '.bam' extension -- confirm).
            self.final_assembly_bam[:-4],
            threads=self.threads,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset='very-sensitive-local',
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh,
        )

        if self.assembly.has_contigs_on_both_strands:
            self.status_flag.add('hit_both_strands')

        print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)
        if not self.assembly.scaff_graph_ok:
            self.status_flag.add('scaffold_graph_bad')

        print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
        self.assembly_compare = assembly_compare.AssemblyCompare(
            self.final_assembly_fa,
            self.assembly.sequences,
            self.reference_fa,
            self.ref_sequence,
            self.assembly_compare_prefix,
            self.refdata,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
            assembled_threshold=self.assembled_threshold,
            unique_threshold=self.unique_threshold,
            max_gene_nt_extend=self.max_gene_nt_extend,
        )
        self.assembly_compare.run()
        self.status_flag = self.assembly_compare.update_flag(self.status_flag)

        allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(
            self.assembly_compare.nucmer_hits
        )
        assembly_variants_obj = assembly_variants.AssemblyVariants(
            self.refdata, self.assembly_compare.nucmer_snps_file
        )
        self.assembly_variants = assembly_variants_obj.get_variants(
            self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos
        )

        # Stop scanning as soon as the flag is set: one qualifying variant
        # (fourth field not '.', 'SYN' or None) is enough.
        for var_list in self.assembly_variants.values():
            for var in var_list:
                if var[3] not in ['.', 'SYN', None]:
                    self.status_flag.add('has_variant')
                    break
            if self.status_flag.has('has_variant'):
                break

        print('\nCalling variants with samtools:', file=self.log_fh, flush=True)
        self.samtools_vars = samtools_variants.SamtoolsVariants(
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.samtools_vars_prefix,
            log_fh=self.log_fh,
            min_var_read_depth=self.min_var_read_depth,
            min_second_var_read_depth=self.min_second_var_read_depth,
            max_allele_freq=self.max_allele_freq,
        )
        self.samtools_vars.run()
        self.total_contig_depths = self.samtools_vars.total_depth_per_contig(
            self.samtools_vars.contig_depths_file
        )
        self.variants_from_samtools = self.samtools_vars.variants_in_coords(
            self.assembly_compare.assembly_match_coords(),
            self.samtools_vars.vcf_file,
        )
        if len(self.variants_from_samtools):
            self.status_flag.add('variants_suggest_collapsed_repeat')
    elif not self.assembled_ok:
        print('\nAssembly failed\n', file=self.log_fh, flush=True)
        self.status_flag.add('assembly_fail')
    elif self.assembly.ref_seq_name is None:
        print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
        self.status_flag.add('ref_seq_choose_fail')

    # ---- report and tidy up ------------------------------------------
    try:
        self.report_lines = report.report_lines(self)
    except Exception as err:
        # Catch Exception rather than everything: a bare ``except`` would
        # also turn KeyboardInterrupt / SystemExit into an Error.
        print('Error making report for cluster ', self.name, '... traceback:', file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        raise Error('Error making report for cluster ' + self.name) from err

    self._clean()
    atexit.unregister(self._atexit)
def _run(self):
    """Run the whole pipeline for this cluster and store its report lines.

    The steps, in order, are:

    1. If ``self.total_reads`` is zero, skip assembly and mark
       ``self.assembled_ok`` as ``False``. Otherwise prepare the reads
       for assembly and run ``assembly.Assembly``; ``self.assembled_ok``
       mirrors its result.
    2. If the assembly succeeded and it reports a reference name
       (``self.assembly.ref_seq_name``), look that reference up in
       ``self.refdata``, map all reads back to the assembly with bowtie2
       (single thread), compare the assembly against the reference,
       collect assembly variants and run the samtools-based variant
       calling.
    3. Build ``self.report_lines``, call ``self._clean()`` and unregister
       the ``self._atexit`` hook.

    Flags that may be added to ``self.status_flag`` here:
    ``assembly_fail``, ``ref_seq_choose_fail``, ``hit_both_strands``,
    ``scaffold_graph_bad``, ``has_variant`` and
    ``variants_suggest_collapsed_repeat`` (plus whatever
    ``assembly_compare.update_flag`` sets).

    Raises:
        Error: if ``report.report_lines`` raises an ordinary exception.
            The traceback is printed to stderr first and the original
            exception is attached as ``__cause__``. ``KeyboardInterrupt``
            and ``SystemExit`` are deliberately *not* converted, so an
            interrupt is never disguised as a reporting failure.
    """
    print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)

    # ---- assemble -----------------------------------------------------
    if self.total_reads == 0:
        print('No reads left after filtering with cdhit', file=self.log_fh, flush=True)
        self.assembled_ok = False
    else:
        wanted_reads = self._number_of_reads_for_assembly(
            self.longest_ref_length,
            self.reads_insert,
            self.total_reads_bases,
            self.total_reads,
            self.assembly_coverage,
        )
        made_reads = self._make_reads_for_assembly(
            wanted_reads,
            self.total_reads,
            self.all_reads1,
            self.all_reads2,
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            random_seed=self.random_seed,
        )
        print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
        print('Assembling reads:', file=self.log_fh, flush=True)

        self.assembly = assembly.Assembly(
            self.reads_for_assembly1,
            self.reads_for_assembly2,
            self.reference_fa,
            self.references_fa,
            self.assembly_dir,
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.log_fh,
            self.all_refs_fasta,
            contig_name_prefix=self.name,
            assembler=self.assembler,
            extern_progs=self.extern_progs,
            clean=self.clean,
        )
        self.assembly.run()
        self.assembled_ok = self.assembly.assembled_ok

        self._clean_file(self.reads_for_assembly1)
        self._clean_file(self.reads_for_assembly2)
        if self.clean:
            print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
            shutil.rmtree(self.assembly_dir)

    # ---- analyse the assembly ----------------------------------------
    # NOTE: the first test short-circuits on assembled_ok, so
    # self.assembly is not touched when the assembly step was skipped.
    if self.assembled_ok and self.assembly.ref_seq_name is not None:
        self.ref_sequence = self.refdata.sequence(self.assembly.ref_seq_name)
        is_gene, is_variant_only = self.refdata.sequence_type(self.ref_sequence.id)
        # Both attributes are stored as the strings '1' / '0'.
        self.is_gene = '1' if is_gene == 'p' else '0'
        self.is_variant_only = '1' if is_variant_only else '0'

        print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)
        mapping.run_bowtie2(
            self.all_reads1,
            self.all_reads2,
            self.final_assembly_fa,
            # Output prefix: the BAM path minus its last four characters
            # (presumably the '.bam' extension -- confirm).
            self.final_assembly_bam[:-4],
            threads=1,
            sort=True,
            bowtie2=self.extern_progs.exe('bowtie2'),
            bowtie2_preset='very-sensitive-local',
            bowtie2_version=self.extern_progs.version('bowtie2'),
            verbose=True,
            verbose_filehandle=self.log_fh,
        )

        if self.assembly.has_contigs_on_both_strands:
            self.status_flag.add('hit_both_strands')

        print('\nMaking and checking scaffold graph', file=self.log_fh, flush=True)
        if not self.assembly.scaff_graph_ok:
            self.status_flag.add('scaffold_graph_bad')

        print('Comparing assembly against reference sequence', file=self.log_fh, flush=True)
        self.assembly_compare = assembly_compare.AssemblyCompare(
            self.final_assembly_fa,
            self.assembly.sequences,
            self.reference_fa,
            self.ref_sequence,
            self.assembly_compare_prefix,
            self.refdata,
            nucmer_min_id=self.nucmer_min_id,
            nucmer_min_len=self.nucmer_min_len,
            nucmer_breaklen=self.nucmer_breaklen,
            assembled_threshold=self.assembled_threshold,
            unique_threshold=self.unique_threshold,
            max_gene_nt_extend=self.max_gene_nt_extend,
        )
        self.assembly_compare.run()
        self.status_flag = self.assembly_compare.update_flag(self.status_flag)

        allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(
            self.assembly_compare.nucmer_hits
        )
        assembly_variants_obj = assembly_variants.AssemblyVariants(
            self.refdata, self.assembly_compare.nucmer_snps_file
        )
        self.assembly_variants = assembly_variants_obj.get_variants(
            self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos
        )

        # Stop scanning as soon as the flag is set: one qualifying variant
        # (fourth field not '.', 'SYN' or None) is enough.
        for var_list in self.assembly_variants.values():
            for var in var_list:
                if var[3] not in ['.', 'SYN', None]:
                    self.status_flag.add('has_variant')
                    break
            if self.status_flag.has('has_variant'):
                break

        print('\nCalling variants with samtools:', file=self.log_fh, flush=True)
        self.samtools_vars = samtools_variants.SamtoolsVariants(
            self.final_assembly_fa,
            self.final_assembly_bam,
            self.samtools_vars_prefix,
            log_fh=self.log_fh,
            min_var_read_depth=self.min_var_read_depth,
            min_second_var_read_depth=self.min_second_var_read_depth,
            max_allele_freq=self.max_allele_freq,
        )
        self.samtools_vars.run()
        self.total_contig_depths = self.samtools_vars.total_depth_per_contig(
            self.samtools_vars.contig_depths_file
        )
        self.variants_from_samtools = self.samtools_vars.variants_in_coords(
            self.assembly_compare.assembly_match_coords(),
            self.samtools_vars.vcf_file,
        )
        if len(self.variants_from_samtools):
            self.status_flag.add('variants_suggest_collapsed_repeat')
    elif not self.assembled_ok:
        print('\nAssembly failed\n', file=self.log_fh, flush=True)
        self.status_flag.add('assembly_fail')
    elif self.assembly.ref_seq_name is None:
        print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
        self.status_flag.add('ref_seq_choose_fail')

    # ---- report and tidy up ------------------------------------------
    try:
        self.report_lines = report.report_lines(self)
    except Exception as err:
        # Catch Exception rather than everything: a bare ``except`` would
        # also turn KeyboardInterrupt / SystemExit into an Error.
        print('Error making report for cluster ', self.name, '... traceback:', file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        raise Error('Error making report for cluster ' + self.name) from err

    self._clean()
    atexit.unregister(self._atexit)