def test_empty_input_succeed(self):
    # refine_assembly is run against an empty reference FASTA and the
    # 'empty.bam' test input; it must complete and write an empty FASTA.
    novoalign = tools.novoalign.NovoalignTool()
    novoalign.install()

    # make the input fasta empty
    inFasta = util.file.mkstempfname('.input.fasta')
    util.file.touch(inFasta)
    novoalign.index_fasta(inFasta)

    inBam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
    outFasta = util.file.mkstempfname('.refined.fasta')

    # run refine_assembly through its command-line parser
    args = [
        inFasta, inBam, outFasta,
        "--chr_names", 'G5012.3',
        "--min_coverage", '3',
        "--novo_params", "-r Random -l 30 -g 40 -x 20 -t 502 -c {}".format(_CPUS)
    ]
    args = assembly.parser_refine_assembly(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)

    # the expected output is an empty fasta file
    self.assertTrue(os.path.isfile(outFasta))
    # assertEqual reports the actual size when the check fails
    self.assertEqual(os.path.getsize(outFasta), 0)
def test_ref_assisted_assembly(self):
    # refine_assembly is run with a copy of 'ebov-makona.fasta' as the
    # reference and 'G5012.3.testreads.bam' as the reads; the refined
    # FASTA is then checked for size and unambiguous-base content.
    novoalign = tools.novoalign.NovoalignTool()
    novoalign.install()

    # prep inputs: work on a temp copy so the index is built next to the
    # copy rather than next to the checked-in test input
    orig_ref = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta')
    refGenome = util.file.mkstempfname('.ref.fasta')
    shutil.copyfile(orig_ref, refGenome)
    novoalign.index_fasta(refGenome)

    inBam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')
    outFasta = util.file.mkstempfname('.refined.fasta')

    # run refine_assembly
    args = [refGenome, inBam, outFasta,
            "--chr_names", 'G5012.3',
            "--min_coverage", '3',
            "--novo_params", "-r Random -l 30 -g 40 -x 20 -t 502"]
    # NOTE(review): sibling tests pass an explicit argparse.ArgumentParser()
    # here; confirm that relying on the parser function's default is intended.
    args = assembly.parser_refine_assembly().parse_args(args)
    args.func_main(args)

    self.assertTrue(os.path.isfile(outFasta))
    # assertGreater reports the actual size when the check fails
    self.assertGreater(os.path.getsize(outFasta), 1000)

    # check assembly quality
    with open(outFasta, 'rt') as inf:
        seq = Bio.SeqIO.read(inf, 'fasta')
    self.assertGreater(len(seq), 17000)
    self.assertGreater(assembly.unambig_count(seq.seq), len(seq) * 0.95)
def test_empty_input_bam_assembly(self):
    # refine_assembly is run with a real reference ('ebov-makona.fasta')
    # but the 'empty.bam' test input; it must write an empty FASTA.
    novoalign = tools.novoalign.NovoalignTool()
    novoalign.install()

    # prep inputs: index a temp copy of the reference
    orig_ref = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta')
    inFasta = util.file.mkstempfname('.ref.fasta')
    shutil.copyfile(orig_ref, inFasta)
    novoalign.index_fasta(inFasta)

    inBam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
    outFasta = util.file.mkstempfname('.refined.fasta')

    # run refine_assembly through its command-line parser
    args = [
        inFasta, inBam, outFasta,
        "--chr_names", 'G5012.3',
        "--min_coverage", '3',
        "--novo_params", "-r Random -l 30 -g 40 -x 20 -t 502"
    ]
    args = assembly.parser_refine_assembly(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)

    # the expected output is an empty fasta file
    self.assertTrue(os.path.isfile(outFasta))
    # assertEqual reports the actual size when the check fails
    self.assertEqual(os.path.getsize(outFasta), 0)
def test_empty_input_fasta_assembly(self):
    # refine_assembly is run with an empty reference FASTA and the
    # 'G5012.3.testreads.bam' reads; it must write an empty FASTA.
    novoalign = tools.novoalign.NovoalignTool()
    novoalign.install()

    # make the input fasta empty
    inFasta = util.file.mkstempfname('.input.fasta')
    util.file.touch(inFasta)
    novoalign.index_fasta(inFasta)

    inBam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam')
    outFasta = util.file.mkstempfname('.refined.fasta')

    # run refine_assembly through its command-line parser
    args = [inFasta, inBam, outFasta,
            "--chr_names", 'G5012.3',
            "--min_coverage", '3',
            "--novo_params", "-r Random -l 30 -g 40 -x 20 -t 502 -c {}".format(_CPUS)]
    args = assembly.parser_refine_assembly(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)

    # the expected output is an empty fasta file
    self.assertTrue(os.path.isfile(outFasta))
    # assertEqual reports the actual size when the check fails
    self.assertEqual(os.path.getsize(outFasta), 0)
def test_empty_input_bam_assembly(self):
    # refine_assembly is run with a real reference ('ebov-makona.fasta')
    # but the 'empty.bam' test input; it must write an empty FASTA.
    # NOTE(review): a test with this same name appears earlier in the
    # visible source; if both live in one class only the later definition
    # runs. Confirm and de-duplicate.
    novoalign = tools.novoalign.NovoalignTool()
    novoalign.install()

    # prep inputs: index a temp copy of the reference
    orig_ref = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta')
    inFasta = util.file.mkstempfname('.ref.fasta')
    shutil.copyfile(orig_ref, inFasta)
    novoalign.index_fasta(inFasta)

    inBam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
    outFasta = util.file.mkstempfname('.refined.fasta')

    # run refine_assembly through its command-line parser
    args = [inFasta, inBam, outFasta,
            "--chr_names", 'G5012.3',
            "--min_coverage", '3',
            "--novo_params", "-r Random -l 30 -g 40 -x 20 -t 502"]
    args = assembly.parser_refine_assembly(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)

    # the expected output is an empty fasta file
    self.assertTrue(os.path.isfile(outFasta))
    # assertEqual reports the actual size when the check fails
    self.assertEqual(os.path.getsize(outFasta), 0)
def refine_assembly(inFasta, inBam, outFasta,
                    outVcf=None, outBam=None, novo_params='',
                    min_coverage=2, chr_names=None, keep_all_reads=False,
                    JVMmemory=None):
    ''' This is a refinement step where we take a crude assembly, align
        all reads back to it, and modify the assembly to the majority
        allele at each position based on read pileups.
        This step considers both SNPs as well as indels called by GATK
        and will correct the consensus based on GATK calls.
        Reads are aligned with Novoalign, then PCR duplicates are removed
        with Picard (in order to debias the allele counts in the pileups),
        and realigned with GATK's IndelRealigner (in order to call indels).
        Output FASTA file is indexed for Picard, Samtools, and Novoalign.

        Args:
            inFasta: assembly FASTA that the reads are aligned against.
            inBam: reads handed to Novoalign.
            outFasta: path the refined FASTA is copied to and indexed at.
            outVcf: if set, the GATK call set is copied here; when the name
                ends in '.gz' the matching '.tbi' is copied as well.
            outBam: if set, the realigned BAM is copied here.
            novo_params: whitespace-separated extra Novoalign options.
            min_coverage: forwarded to vcf_to_fasta as --min_coverage.
            chr_names: optional list of names forwarded to vcf_to_fasta
                as --name; None or empty means no --name option.
            keep_all_reads: when true, Novoalign runs with min_qual=0 and
                Picard is not asked to remove duplicates.
            JVMmemory: forwarded to the Novoalign, Picard and GATK calls.

        Returns:
            0 on completion.
    '''
    # None replaces the former mutable `[]` default; normalise it so the
    # rest of the function can keep treating chr_names as a list.
    chr_names = chr_names or []

    # Get tools
    picard_index = tools.picard.CreateSequenceDictionaryTool()
    picard_mkdup = tools.picard.MarkDuplicatesTool()
    samtools = tools.samtools.SamtoolsTool()
    novoalign = tools.novoalign.NovoalignTool()
    gatk = tools.gatk.GATKTool()

    # Create deambiguated genome for GATK
    deambigFasta = util.file.mkstempfname('.deambig.fasta')
    deambig_fasta(inFasta, deambigFasta)
    picard_index.execute(deambigFasta, overwrite=True)
    samtools.faidx(deambigFasta, overwrite=True)

    # Novoalign reads to self
    novoBam = util.file.mkstempfname('.novoalign.bam')
    min_qual = 0 if keep_all_reads else 1
    novoalign.execute(inBam, inFasta, novoBam,
                      options=novo_params.split(), min_qual=min_qual,
                      JVMmemory=JVMmemory)

    # Mark (and, unless keep_all_reads, remove) duplicates
    rmdupBam = util.file.mkstempfname('.rmdup.bam')
    opts = ['CREATE_INDEX=true']
    if not keep_all_reads:
        opts.append('REMOVE_DUPLICATES=true')
    picard_mkdup.execute([novoBam], rmdupBam, picardOptions=opts,
                         JVMmemory=JVMmemory)
    os.unlink(novoBam)

    # Realign against the deambiguated reference
    realignBam = util.file.mkstempfname('.realign.bam')
    gatk.local_realign(rmdupBam, deambigFasta, realignBam, JVMmemory=JVMmemory)
    os.unlink(rmdupBam)
    if outBam:
        shutil.copyfile(realignBam, outBam)

    # Modify original assembly with VCF calls from GATK
    tmpVcf = util.file.mkstempfname('.vcf.gz')
    tmpFasta = util.file.mkstempfname('.fasta')
    gatk.ug(realignBam, deambigFasta, tmpVcf, JVMmemory=JVMmemory)
    os.unlink(realignBam)
    os.unlink(deambigFasta)
    name_opts = []
    if chr_names:
        name_opts = ['--name'] + chr_names
    main_vcf_to_fasta(parser_vcf_to_fasta().parse_args([
        tmpVcf, tmpFasta, '--trim_ends', '--min_coverage', str(min_coverage),
    ] + name_opts))
    if outVcf:
        shutil.copyfile(tmpVcf, outVcf)
        if outVcf.endswith('.gz'):
            shutil.copyfile(tmpVcf + '.tbi', outVcf + '.tbi')
    os.unlink(tmpVcf)
    shutil.copyfile(tmpFasta, outFasta)
    os.unlink(tmpFasta)

    # Index final output FASTA for Picard/GATK, Samtools, and Novoalign
    picard_index.execute(outFasta, overwrite=True)
    samtools.faidx(outFasta, overwrite=True)
    novoalign.index_fasta(outFasta)
    return 0
def refine_assembly(inFasta, inBam, outFasta,
                    outVcf=None, outBam=None, novo_params='',
                    min_coverage=2, chr_names=None, keep_all_reads=False,
                    JVMmemory=None, threads=1):
    ''' Refine a crude assembly: align all reads back to it and rewrite
        the assembly to the majority allele at each position based on
        read pileups. Both SNPs and indels called by GATK are considered
        when correcting the consensus.
        Reads are aligned with Novoalign, PCR duplicates are removed with
        Picard (to debias allele counts in the pileups), and the result is
        realigned with GATK's IndelRealigner (in order to call indels).
        The output FASTA is indexed for Picard, Samtools, and Novoalign.

        outVcf / outBam, when given, receive copies of the GATK calls and
        the realigned BAM. keep_all_reads disables duplicate removal and
        lowers the Novoalign min_qual to 0. threads is forwarded to the
        two GATK calls. Returns 0.
    '''
    chr_names = chr_names or []

    # Tool wrappers
    dict_tool = tools.picard.CreateSequenceDictionaryTool()
    dedup_tool = tools.picard.MarkDuplicatesTool()
    sam_tool = tools.samtools.SamtoolsTool()
    aligner = tools.novoalign.NovoalignTool()
    gatk_tool = tools.gatk.GATKTool()

    # Deambiguated copy of the input, indexed for GATK
    plain_ref = util.file.mkstempfname('.deambig.fasta')
    deambig_fasta(inFasta, plain_ref)
    dict_tool.execute(plain_ref, overwrite=True)
    sam_tool.faidx(plain_ref, overwrite=True)

    # Align the reads back onto the input assembly
    aligned_bam = util.file.mkstempfname('.novoalign.bam')
    aligner.execute(
        inBam, inFasta, aligned_bam,
        options=novo_params.split(),
        min_qual=0 if keep_all_reads else 1,
        JVMmemory=JVMmemory,
    )

    # Duplicate marking; removal is skipped when every read is kept
    dedup_bam = util.file.mkstempfname('.rmdup.bam')
    dedup_opts = ['CREATE_INDEX=true'] + ([] if keep_all_reads else ['REMOVE_DUPLICATES=true'])
    dedup_tool.execute([aligned_bam], dedup_bam, picardOptions=dedup_opts, JVMmemory=JVMmemory)
    os.unlink(aligned_bam)

    # Local realignment against the deambiguated reference
    realigned_bam = util.file.mkstempfname('.realign.bam')
    gatk_tool.local_realign(dedup_bam, plain_ref, realigned_bam,
                            JVMmemory=JVMmemory, threads=threads)
    os.unlink(dedup_bam)
    if outBam:
        shutil.copyfile(realigned_bam, outBam)

    # Call variants, then fold the calls back into a consensus FASTA
    calls_vcf = util.file.mkstempfname('.vcf.gz')
    consensus_fasta = util.file.mkstempfname('.fasta')
    gatk_tool.ug(realigned_bam, plain_ref, calls_vcf,
                 JVMmemory=JVMmemory, threads=threads)
    os.unlink(realigned_bam)
    os.unlink(plain_ref)

    name_args = (['--name'] + chr_names) if chr_names else []
    v2f_args = parser_vcf_to_fasta().parse_args(
        [calls_vcf, consensus_fasta, '--trim_ends', '--min_coverage', str(min_coverage)]
        + name_args
    )
    main_vcf_to_fasta(v2f_args)

    if outVcf:
        shutil.copyfile(calls_vcf, outVcf)
        if outVcf.endswith('.gz'):
            # the index travels with a compressed VCF
            shutil.copyfile(calls_vcf + '.tbi', outVcf + '.tbi')
    os.unlink(calls_vcf)

    shutil.copyfile(consensus_fasta, outFasta)
    os.unlink(consensus_fasta)

    # Index the final FASTA for Picard/GATK, Samtools, and Novoalign
    dict_tool.execute(outFasta, overwrite=True)
    sam_tool.faidx(outFasta, overwrite=True)
    aligner.index_fasta(outFasta)
    return 0