Esempio n. 1
0
 def test_assemble_with_spades_fail(self):
     '''_assemble_with_spades must leave assembled_ok False for these reads.

     Builds an Assembly configured for the spades assembler on the
     "..._fails_reads" fixture files, runs _assemble_with_spades() and
     asserts that the object reports failure. The temporary working
     directory and log file are removed afterwards.
     '''
     fwd_reads, rev_reads = (
         os.path.join(data_dir, fq_name) for fq_name in (
             'assembly_test_assemble_with_spades_fails_reads_1.fq',
             'assembly_test_assemble_with_spades_fails_reads_2.fq',
         ))
     work_dir = 'tmp.test_assemble_with_spades_fail'
     log_path = 'tmp.test_assemble_with_spades_fail.log'

     # Placeholder for constructor arguments this test does not care about.
     unused = 'not needed'
     assembler_kwargs = {
         'assembler': "spades",
         'spades_options': " --only-assembler",
     }

     with open(log_path, 'w') as log_fh:
         print('First line', file=log_fh)
         # Start from a clean slate in case an earlier run left the
         # working directory behind.
         shutil.rmtree(work_dir, ignore_errors=True)
         asm = assembly.Assembly(
             fwd_reads, rev_reads,
             unused, unused,
             work_dir,
             'not_needed_for_this_test.fa',
             'not_needed_for_this_test.bam',
             log_fh,
             unused,
             **assembler_kwargs)
         asm._assemble_with_spades()

     self.assertFalse(asm.assembled_ok)

     # Tidy up the artefacts created above.
     shutil.rmtree(work_dir, ignore_errors=True)
     os.unlink(log_path)
Esempio n. 2
0
 def test_assemble_with_fermilite(self):
     '''test _assemble_with_fermilite

     Runs _assemble_with_fermilite() on the fixture reads and checks that:
       * the Assembly object reports assembled_ok,
       * the log written during the run is byte-identical to the
         expected log fixture,
       * debug_all_contigs.fa in the working directory is byte-identical
         to the expected FASTA fixture.
     The temporary directory and log file are removed at the end.
     '''
     reads1 = os.path.join(data_dir,
                           'assembly_assemble_with_fermilite.reads_1.fq')
     reads2 = os.path.join(data_dir,
                           'assembly_assemble_with_fermilite.reads_2.fq')
     expected_log = os.path.join(
         data_dir, 'assembly_assemble_with_fermilite.expected.log')
     expected_fa = os.path.join(
         data_dir, 'assembly_assemble_with_fermilite.expected.fa')
     tmp_dir = 'tmp.test_assemble_with_fermilite'
     tmp_log = 'tmp.test_assemble_with_fermilite.log'
     # The context manager guarantees the log handle is closed (and
     # flushed) even when the assembler raises, so the handle never leaks
     # and the file is complete before it is compared below.
     with open(tmp_log, 'w') as tmp_log_fh:
         print('First line', file=tmp_log_fh)
         # Same pre-clean as the spades tests: drop any directory left
         # over from a previous aborted run.
         shutil.rmtree(tmp_dir, ignore_errors=True)
         a = assembly.Assembly(reads1, reads2, 'not needed', 'not needed',
                               tmp_dir, 'not_needed_for_this_test.fa',
                               'not_needed_for_this_test.bam', tmp_log_fh,
                               'not needed')
         a._assemble_with_fermilite()
     self.assertTrue(a.assembled_ok)
     self.assertTrue(filecmp.cmp(expected_log, tmp_log, shallow=False))
     self.assertTrue(
         filecmp.cmp(expected_fa,
                     os.path.join(tmp_dir, 'debug_all_contigs.fa'),
                     shallow=False))
     shutil.rmtree(tmp_dir)
     os.unlink(tmp_log)
Esempio n. 3
0
 def test_assemble_with_spades(self):
     '''Run _assemble_with_spades on the fixture reads; expect assembled_ok.

     The Assembly is created with assembler="spades", its
     _assemble_with_spades() method is invoked, and the test asserts that
     assembled_ok is true. Temporary outputs are deleted afterwards.
     '''
     out_dir = 'tmp.test_assemble_with_spades'
     log_file = 'tmp.test_assemble_with_spades.log'
     fq_forward = os.path.join(
         data_dir, 'assembly_test_assemble_with_spades_reads_1.fq')
     fq_reverse = os.path.join(
         data_dir, 'assembly_test_assemble_with_spades_reads_2.fq')

     with open(log_file, 'w') as log_handle:
         print('First line', file=log_handle)
         shutil.rmtree(out_dir, ignore_errors=True)

         positional_args = (
             fq_forward,
             fq_reverse,
             'not needed',
             'not needed',
             out_dir,
             'not_needed_for_this_test.fa',
             'not_needed_for_this_test.bam',
             log_handle,
             'not needed',
         )
         # " --only-assembler" is passed because error correction cannot
         # determine the quality offset on this artificial dataset.
         spades_assembly = assembly.Assembly(
             *positional_args,
             assembler="spades",
             spades_options=" --only-assembler")
         spades_assembly._assemble_with_spades()

     succeeded = spades_assembly.assembled_ok
     self.assertTrue(succeeded)

     shutil.rmtree(out_dir, ignore_errors=True)
     os.unlink(log_file)
Esempio n. 4
0
    def _run(self):
        '''Assemble this cluster's reads, analyse the result and build the report.

        Progress messages are written to ``self.log_fh`` throughout. The
        visible stages are:

        1. Unless ``self.total_reads`` is 0, pick a subset of reads, run
           ``assembly.Assembly`` on them and record ``self.assembled_ok``.
        2. If the assembly succeeded and it chose a reference sequence
           name: map all reads to the assembly with bowtie2, compare the
           assembly with the reference, collect assembly variants, and
           call variants with samtools. Status flags are added to
           ``self.status_flag`` as findings come up.
        3. Otherwise add the 'assembly_fail' or 'ref_seq_choose_fail'
           flag.
        4. Build ``self.report_lines``, call ``self._clean()`` and
           unregister ``self._atexit``.

        Raises:
            Error: if ``report.report_lines`` raises an ``Exception``. The
                traceback is printed to stderr first and the original
                exception is chained as the cause. ``KeyboardInterrupt``
                and ``SystemExit`` are deliberately not intercepted.
        '''
        print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '),
              file=self.log_fh,
              flush=True)

        if self.total_reads == 0:
            print('No reads left after filtering with cdhit',
                  file=self.log_fh,
                  flush=True)
            self.assembled_ok = False
        else:
            wanted_reads = self._number_of_reads_for_assembly(
                self.longest_ref_length, self.reads_insert,
                self.total_reads_bases, self.total_reads,
                self.assembly_coverage)
            made_reads = self._make_reads_for_assembly(
                wanted_reads,
                self.total_reads,
                self.all_reads1,
                self.all_reads2,
                self.reads_for_assembly1,
                self.reads_for_assembly2,
                random_seed=self.random_seed)
            print('\nUsing',
                  made_reads,
                  'from a total of',
                  self.total_reads,
                  'for assembly.',
                  file=self.log_fh,
                  flush=True)
            print('Assembling reads:', file=self.log_fh, flush=True)

            self._update_threads()
            self.assembly = assembly.Assembly(
                self.reads_for_assembly1,
                self.reads_for_assembly2,
                self.reference_fa,
                self.references_fa,
                self.assembly_dir,
                self.final_assembly_fa,
                self.final_assembly_bam,
                self.log_fh,
                self.all_refs_fasta,
                contig_name_prefix=self.name,
                assembler=self.assembler,
                extern_progs=self.extern_progs,
                clean=self.clean,
                spades_mode=self.spades_mode,
                spades_options=self.spades_options,
                threads=self.threads)

            self.assembly.run()
            self.assembled_ok = self.assembly.assembled_ok
            # The subsampled read files are only inputs to the assembler.
            self._clean_file(self.reads_for_assembly1)
            self._clean_file(self.reads_for_assembly2)
            if self.clean:
                print('Deleting Assembly directory',
                      self.assembly_dir,
                      file=self.log_fh,
                      flush=True)
                shutil.rmtree(self.assembly_dir, ignore_errors=True)

        # Within this method self.assembly is only assigned in the branch
        # above; testing assembled_ok first short-circuits before it is
        # touched on the "no reads" path.
        if self.assembled_ok and self.assembly.ref_seq_name is not None:
            self.ref_sequence = self.refdata.sequence(
                self.assembly.ref_seq_name)
            is_gene, is_variant_only = self.refdata.sequence_type(
                self.ref_sequence.id)
            # Both values are stored as the strings '1' / '0'.
            self.is_gene = '1' if is_gene == 'p' else '0'
            self.is_variant_only = '1' if is_variant_only else '0'

            print('\nAssembly was successful\n\nMapping reads to assembly:',
                  file=self.log_fh,
                  flush=True)
            self._update_threads()
            # The output prefix is the BAM path minus its last four
            # characters (presumably the '.bam' extension - confirm).
            mapping.run_bowtie2(
                self.all_reads1,
                self.all_reads2,
                self.final_assembly_fa,
                self.final_assembly_bam[:-4],
                threads=self.threads,
                sort=True,
                bowtie2=self.extern_progs.exe('bowtie2'),
                bowtie2_preset='very-sensitive-local',
                bowtie2_version=self.extern_progs.version('bowtie2'),
                verbose=True,
                verbose_filehandle=self.log_fh)

            if self.assembly.has_contigs_on_both_strands:
                self.status_flag.add('hit_both_strands')

            print('\nMaking and checking scaffold graph',
                  file=self.log_fh,
                  flush=True)

            if not self.assembly.scaff_graph_ok:
                self.status_flag.add('scaffold_graph_bad')

            print('Comparing assembly against reference sequence',
                  file=self.log_fh,
                  flush=True)
            self.assembly_compare = assembly_compare.AssemblyCompare(
                self.final_assembly_fa,
                self.assembly.sequences,
                self.reference_fa,
                self.ref_sequence,
                self.assembly_compare_prefix,
                self.refdata,
                nucmer_min_id=self.nucmer_min_id,
                nucmer_min_len=self.nucmer_min_len,
                nucmer_breaklen=self.nucmer_breaklen,
                assembled_threshold=self.assembled_threshold,
                unique_threshold=self.unique_threshold,
                max_gene_nt_extend=self.max_gene_nt_extend,
            )
            self.assembly_compare.run()
            self.status_flag = self.assembly_compare.update_flag(
                self.status_flag)

            allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(
                self.assembly_compare.nucmer_hits)
            assembly_variants_obj = assembly_variants.AssemblyVariants(
                self.refdata, self.assembly_compare.nucmer_snps_file)
            self.assembly_variants = assembly_variants_obj.get_variants(
                self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos)

            # Set 'has_variant' as soon as one variant's fourth field is
            # something other than '.', 'SYN' or None, then stop scanning.
            for var_list in self.assembly_variants.values():
                for var in var_list:
                    if var[3] not in ['.', 'SYN', None]:
                        self.status_flag.add('has_variant')
                        break

                if self.status_flag.has('has_variant'):
                    break

            print('\nCalling variants with samtools:',
                  file=self.log_fh,
                  flush=True)

            self.samtools_vars = samtools_variants.SamtoolsVariants(
                self.final_assembly_fa,
                self.final_assembly_bam,
                self.samtools_vars_prefix,
                log_fh=self.log_fh,
                min_var_read_depth=self.min_var_read_depth,
                min_second_var_read_depth=self.min_second_var_read_depth,
                max_allele_freq=self.max_allele_freq)
            self.samtools_vars.run()

            self.total_contig_depths = self.samtools_vars.total_depth_per_contig(
                self.samtools_vars.contig_depths_file)

            self.variants_from_samtools = self.samtools_vars.variants_in_coords(
                self.assembly_compare.assembly_match_coords(),
                self.samtools_vars.vcf_file)
            if len(self.variants_from_samtools):
                self.status_flag.add('variants_suggest_collapsed_repeat')
        elif not self.assembled_ok:
            print('\nAssembly failed\n', file=self.log_fh, flush=True)
            self.status_flag.add('assembly_fail')
        elif self.assembly.ref_seq_name is None:
            print('\nCould not get closest reference sequence\n',
                  file=self.log_fh,
                  flush=True)
            self.status_flag.add('ref_seq_choose_fail')

        try:
            self.report_lines = report.report_lines(self)
        except Exception as err:
            # Catch Exception rather than everything so that Ctrl-C and
            # interpreter shutdown are not rewrapped as a report Error.
            print('Error making report for cluster ',
                  self.name,
                  '... traceback:',
                  file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            # Chain the original exception so its cause is preserved.
            raise Error('Error making report for cluster ' + self.name) from err

        self._clean()
        atexit.unregister(self._atexit)