def refine_assembly_step(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        iteration=None, subsample=None, seed=None, sample_id='sampleXX',
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Run one refinement iteration: align, call variants, build consensus

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to the assembly used as alignment reference
        outdir (str): Path to output directory
        iteration (int): Iteration number used to suffix the output file
            names; if None the outputs are not numbered
        subsample (int): Number of reads to sample before aligning; if None
            the input reads are used as given
        seed (int): Seed passed to the read sampler. When None (and
            subsampling) a value is drawn with `random.randrange(1, 1000)`
        sample_id (str): Read group ID
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_refined (str): Path to refined assembly
        out_bt2 (str): Path to bowtie2 report

    """
    # Temporary directory
    tempdir = sysutils.create_tempdir('refine_assembly', None, quiet, logfile)

    if subsample is not None:
        seed = seed if seed is not None else random.randrange(1, 1000)
        # Forward the caller's `quiet` flag; it used to be forced to False,
        # which made subsampling noisy even when the caller asked for silence.
        fq1, fq2, fqU = sample_reads.sample_reads(
            fq1=fq1, fq2=fq2, fqU=fqU, outdir=tempdir,
            nreads=subsample, seed=seed,
            quiet=quiet, logfile=logfile, debug=debug
        )

    # Align to reference
    tmp_aligned, tmp_bt2 = align_reads.align_reads(
        fq1=fq1, fq2=fq2, fqU=fqU, ref_fa=ref_fa, outdir=tempdir,
        ncpu=ncpu, xmx=xmx, sample_id=sample_id,
        keep_tmp=keep_tmp, quiet=quiet, logfile=logfile, debug=debug,
    )

    # Call variants (all sites are emitted so a full consensus can be built)
    tmp_vcf = call_variants.call_variants(
        aln_bam=tmp_aligned, ref_fa=ref_fa, outdir=tempdir,
        emit_all=True,
        ncpu=ncpu, xmx=xmx,
        keep_tmp=keep_tmp, quiet=quiet, logfile=logfile, debug=debug,
    )

    # Generate consensus
    tmp_fasta = vcf_to_consensus.vcf_to_consensus(
        vcf=tmp_vcf, outdir=tempdir, sampidx=0,
        keep_tmp=keep_tmp, quiet=quiet, logfile=logfile
    )

    # Copy results out of the temporary directory
    if iteration is None:
        out_refined = os.path.join(outdir, 'refined.fna')
        out_bt2 = os.path.join(outdir, 'refined_bt2.out')
    else:
        out_refined = os.path.join(outdir, 'refined.%02d.fna' % iteration)
        out_bt2 = os.path.join(outdir, 'refined_bt2.%02d.out' % iteration)

    shutil.copy(tmp_fasta, out_refined)
    shutil.copy(tmp_bt2, out_bt2)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'refine_assembly', quiet, logfile)

    return out_refined, out_bt2
def stageparser(parser):
    """ Add stage-specific options to argparse parser

    Registers the options of the variant calling stage and sets
    `call_variants` as the function to run for this parser.

    Args:
        parser (argparse.ArgumentParser): ArgumentParser object

    Returns:
        None

    """
    group1 = parser.add_argument_group('Input/Output')
    group1.add_argument('--aln_bam', type=sysutils.existing_file,
                        required=True,
                        help='Alignment file.')
    group1.add_argument('--ref_fa', type=sysutils.existing_file,
                        required=True,
                        help='Reference fasta file.')
    group1.add_argument('--outdir', type=sysutils.existing_dir, default='.',
                        help='Output directory')

    group2 = parser.add_argument_group('Variant calling options')
    group2.add_argument('--emit_all', action='store_true',
                        help='Output calls for all sites.')
    group2.add_argument('--min_base_qual', type=int, default=15,
                        help='''Minimum base quality required to consider a base for calling.''')

    group3 = parser.add_argument_group('Settings')
    # default=1 matches the function default; without it argparse supplies
    # None, which breaks the '%d' % ncpu formatting in call_variants.
    group3.add_argument('--ncpu', type=int, default=1,
                        help='Number of CPU to use')
    group3.add_argument('--xmx', type=int,
                        default=sysutils.get_java_heap_size(),
                        help='Maximum heap size for Java VM, in GB.')
    group3.add_argument('--keep_tmp', action='store_true',
                        help='Do not delete temporary directory')
    group3.add_argument('--quiet', action='store_true',
                        help='''Do not write output to console (silence stdout and stderr)''')
    group3.add_argument('--logfile', type=argparse.FileType('a'),
                        help='Append console output to this file')
    group3.add_argument('--debug', action='store_true',
                        help='Print commands but do not run')
    parser.set_defaults(func=call_variants)
def stageparser(parser):
    """ Register the assembly refinement options on an argparse parser

    Options are added in three groups (input/output, refinement options and
    general settings) and `refine_assembly` is installed as the parser's
    `func` default.

    Args:
        parser (argparse.ArgumentParser): Parser that receives the options

    Returns:
        None

    """
    # --- Input/Output
    io_opts = parser.add_argument_group('Input/Output')
    fastq_inputs = (
        ('--fq1', 'Fastq file with read 1'),
        ('--fq2', 'Fastq file with read 2'),
        ('--fqU', 'Fastq file with unpaired reads'),
    )
    for flag, description in fastq_inputs:
        io_opts.add_argument(
            flag, type=sysutils.existing_file, help=description
        )
    io_opts.add_argument(
        '--ref_fa',
        type=sysutils.existing_file,
        required=True,
        help='Assembly to refine',
    )
    io_opts.add_argument(
        '--outdir',
        type=sysutils.existing_dir,
        default='.',
        help='Output directory',
    )

    # --- Refinement options
    refine_opts = parser.add_argument_group('Refinement options')
    refine_opts.add_argument(
        '--max_step',
        type=int,
        default=1,
        help='Maximum number of refinement steps',
    )
    refine_opts.add_argument(
        '--subsample',
        type=int,
        help='Use a subsample of reads for refinement.',
    )
    refine_opts.add_argument(
        '--seed',
        type=int,
        help='Seed for random number generator (ignored if not subsampling).',
    )
    refine_opts.add_argument(
        '--sample_id',
        default='sampleXX',
        help='Sample ID. Used as read group ID in BAM',
    )

    # --- Settings
    settings = parser.add_argument_group('Settings')
    settings.add_argument(
        '--ncpu', type=int, default=1, help='Number of CPUs to use'
    )
    settings.add_argument(
        '--xmx',
        type=int,
        default=sysutils.get_java_heap_size(),
        help='Maximum heap size for Java VM, in GB.',
    )
    boolean_flags = (
        ('--keep_tmp', 'Do not delete temporary directory'),
        ('--quiet',
         'Do not write output to console (silence stdout and stderr)'),
    )
    for flag, description in boolean_flags:
        settings.add_argument(flag, action='store_true', help=description)
    settings.add_argument(
        '--logfile',
        type=argparse.FileType('a'),
        help='Append console output to this file',
    )
    settings.add_argument(
        '--debug', action='store_true', help='Print commands but do not run'
    )

    parser.set_defaults(func=refine_assembly)
def stageparser(parser):
    """ Register the read alignment options on an argparse parser

    Options are added in three groups (input/output, alignment options and
    general settings) and `align_reads` is installed as the parser's `func`
    default.

    Args:
        parser (argparse.ArgumentParser): Parser that receives the options

    Returns:
        None

    """
    # --- Input/Output
    io_opts = parser.add_argument_group('Input/Output')
    fastq_inputs = (
        ('--fq1', 'Fastq file with read 1'),
        ('--fq2', 'Fastq file with read 2'),
        ('--fqU', 'Fastq file with unpaired reads'),
    )
    for flag, description in fastq_inputs:
        io_opts.add_argument(
            flag, type=sysutils.existing_file, help=description
        )
    io_opts.add_argument(
        '--ref_fa',
        type=sysutils.existing_file,
        required=True,
        help='Reference fasta file.',
    )
    io_opts.add_argument(
        '--outdir',
        type=sysutils.existing_dir,
        default='.',
        help='Output directory',
    )

    # --- Alignment options
    bt2_presets = [
        'very-fast', 'fast', 'sensitive', 'very-sensitive',
        'very-fast-local', 'fast-local', 'sensitive-local',
        'very-sensitive-local',
    ]
    aln_opts = parser.add_argument_group('Alignment options')
    aln_opts.add_argument(
        '--bt2_preset',
        default='sensitive-local',
        choices=bt2_presets,
        help='Bowtie2 preset',
    )
    aln_opts.add_argument(
        '--sample_id',
        default='sampleXX',
        help='Sample ID. Used as read group ID in BAM',
    )
    aln_opts.add_argument(
        '--no_realign', action='store_true', help='Do not realign indels'
    )
    aln_opts.add_argument(
        '--remove_duplicates',
        action='store_true',
        help=(
            'Remove duplicates from final alignment. \n'
            'Otherwise duplicates are marked but not removed.'
        ),
    )
    aln_opts.add_argument(
        '--encoding',
        choices=['Phred+33', 'Phred+64'],
        help='Quality score encoding',
    )

    # --- Settings
    settings = parser.add_argument_group('Settings')
    settings.add_argument(
        '--ncpu', type=int, default=1, help='Number of CPUs to use'
    )
    settings.add_argument(
        '--xmx',
        type=int,
        default=sysutils.get_java_heap_size(),
        help='Maximum heap size for Java VM, in GB.',
    )
    boolean_flags = (
        ('--keep_tmp', 'Do not delete temporary directory'),
        ('--quiet',
         'Do not write output to console (silence stdout and stderr)'),
    )
    for flag, description in boolean_flags:
        settings.add_argument(flag, action='store_true', help=description)
    settings.add_argument(
        '--logfile',
        type=argparse.FileType('a'),
        help='Append console output to this file',
    )
    settings.add_argument(
        '--debug', action='store_true', help='Print commands but do not run'
    )

    parser.set_defaults(func=align_reads)
def align_reads(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        bt2_preset='sensitive-local', sample_id='sampleXX',
        no_realign=False, remove_duplicates=False, encoding=None,
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Pipeline step to align reads

    Aligns reads with bowtie2, sorts and indexes the alignment, marks (or
    removes) duplicates with picard and, unless disabled, realigns indels
    with GATK. The final BAM is moved to `outdir` and indexed.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Bowtie2 preset to use for alignment
        sample_id (str): Read group ID
        no_realign (bool): Do not realign indels
        remove_duplicates (bool): Remove duplicates from final alignment
        encoding (str): Quality score encoding; guessed from the reads
            when None
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_aligned (str): Path to aligned BAM file
        out_bt2 (str): Path to bowtie2 report

    Raises:
        MissingRequiredArgument: If the combination of fq1/fq2/fqU is not
            paired, single, or paired plus unpaired
        PipelineStepError: If the final BAM file was not created

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)

    # Check dependencies
    sysutils.check_dependency('bowtie2')
    sysutils.check_dependency('samtools')
    sysutils.check_dependency('picard')

    # Identify correct command for GATK
    GATK_BIN = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Set JVM heap argument (for GATK)
    JAVA_HEAP = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Outputs
    out_aligned = os.path.join(outdir, 'aligned.bam')
    out_bt2 = os.path.join(outdir, 'aligned.bt2.out')

    # Temporary directory
    tempdir = sysutils.create_tempdir('align_reads', None, quiet, logfile)

    # Copy and index initial reference
    curref = os.path.join(tempdir, 'initial.fasta')
    cmd1 = ['cp', ref_fa, curref]
    cmd2 = ['samtools', 'faidx', curref]
    cmd3 = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % curref,
        'O=%s' % os.path.join(tempdir, 'initial.dict')
    ]
    cmd4 = ['bowtie2-build', curref, os.path.join(tempdir, 'initial')]
    sysutils.command_runner([cmd1, cmd2, cmd3, cmd4], 'align_reads:index',
                            quiet, logfile, debug)

    # Align with bowtie2
    cmd5 = [
        'bowtie2',
        '-p', '%d' % ncpu,
        '--phred33' if encoding == "Phred+33" else '--phred64',
        '--no-unal',
        '--rg-id', sample_id,
        '--rg', 'SM:%s' % sample_id,
        '--rg', 'LB:1',
        '--rg', 'PU:1',
        '--rg', 'PL:illumina',
        '--%s' % bt2_preset,
        '-x', '%s' % os.path.join(tempdir, 'initial'),
    ]
    # Two independent checks: with "both", paired AND unpaired reads must be
    # passed. An if/elif chain here silently dropped the unpaired reads.
    if input_reads in ('paired', 'both'):
        cmd5 += ['-1', fq1, '-2', fq2, ]
    if input_reads in ('single', 'both'):
        cmd5 += ['-U', fqU, ]
    cmd5 += ['-S', os.path.join(tempdir, 'aligned.bt2.sam'), ]
    # NOTE(review): the '2>' token (and '|', '>' below) suggests that
    # command_runner executes commands through a shell -- confirm.
    cmd5 += ['2>', out_bt2, ]

    try:
        sysutils.command_runner([cmd5, ], 'align_reads:bowtie2',
                                quiet, logfile, debug)
    except PipelineStepError:
        # Surface the bowtie2 report before propagating the failure
        if os.path.exists(out_bt2):
            with open(out_bt2, 'r') as fh:
                print('[--- bowtie2 stderr ---]\n%s' % fh.read(),
                      file=sys.stderr)
        raise

    cmd6 = [
        'samtools', 'view', '-u', os.path.join(tempdir, 'aligned.bt2.sam'),
        '|',
        'samtools', 'sort',
        '>',
        os.path.join(tempdir, 'sorted.bam'),
    ]
    cmd7 = ['samtools', 'index', os.path.join(tempdir, 'sorted.bam'), ]
    sysutils.command_runner([cmd6, cmd7, ], 'align_reads:samsort',
                            quiet, logfile, debug)

    cur_bam = os.path.join(tempdir, 'sorted.bam')

    if remove_duplicates:
        sysutils.log_message('[--- Removing duplicates ---]', quiet, logfile)
    else:
        sysutils.log_message('[--- Marking duplicates ---]', quiet, logfile)

    # MarkDuplicates
    cmd8 = [
        'picard', 'MarkDuplicates',
        'CREATE_INDEX=true',
        'USE_JDK_DEFLATER=true',
        'USE_JDK_INFLATER=true',
        'M=%s' % os.path.join(tempdir, 'rmdup.metrics.txt'),
        'I=%s' % cur_bam,
        'O=%s' % os.path.join(tempdir, 'rmdup.bam'),
    ]
    if remove_duplicates:
        cmd8 += ['REMOVE_DUPLICATES=true', ]
    sysutils.command_runner([cmd8, ], 'align_reads:markdups',
                            quiet, logfile, debug)
    cur_bam = os.path.join(tempdir, 'rmdup.bam')

    if no_realign:
        print('[--- Skipping realignment ---]', file=sys.stderr)
    else:
        # RealignerTargetCreator
        cmd9 = [
            JAVA_HEAP, GATK_BIN,
            '-T', 'RealignerTargetCreator',
            '-I', cur_bam,
            '-R', curref,
            '-o', os.path.join(tempdir, 'tmp.intervals'),
        ]
        # IndelRealigner
        cmd10 = [
            JAVA_HEAP, GATK_BIN,
            '-T', 'IndelRealigner',
            '--use_jdk_deflater',
            '--use_jdk_inflater',
            '-maxReads', '1000000',
            '-dt', 'NONE',
            '-I', cur_bam,
            '-R', curref,
            '-targetIntervals', os.path.join(tempdir, 'tmp.intervals'),
            '-o', os.path.join(tempdir, 'realign.bam')
        ]
        sysutils.command_runner([cmd9, cmd10, ], 'align_reads:realign',
                                quiet, logfile, debug)
        cur_bam = os.path.join(tempdir, 'realign.bam')

    # Check that cur_bam was created
    if not os.path.exists(cur_bam):
        msg = "BAM does not exist: %s" % cur_bam
        raise sysutils.PipelineStepError(msg)

    # Replace any previous output with the final BAM and index it
    cmd11a = ['rm', '-f', out_aligned, ]
    cmd11b = ['mv', cur_bam, out_aligned, ]
    cmd11c = ['samtools', 'index', out_aligned, ]
    sysutils.command_runner([cmd11a, cmd11b, cmd11c, ], 'align_reads:copy',
                            quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'align_reads', quiet, logfile)

    return out_aligned, out_bt2
def call_variants(
        aln_bam=None, ref_fa=None, outdir='.',
        emit_all=False, min_base_qual=15,
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Pipeline step to call variants

    Copies and indexes the reference in a temporary directory, then runs the
    GATK UnifiedGenotyper on the alignment and writes a VCF to `outdir`.

    Args:
        aln_bam (str): Path to alignment file (BAM)
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        emit_all (bool): Output calls for all sites
        min_base_qual (int): Minimum base quality for calling
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_vcf (str): Path to output VCF

    """
    # Required external tools
    for tool in ('samtools', 'picard'):
        sysutils.check_dependency(tool)

    # GATK executable and JVM heap setting used to launch it
    gatk_bin = sysutils.determine_dependency_path(['gatk', 'gatk3'])
    java_heap = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Output location and scratch space
    out_vcf = os.path.join(outdir, 'variants.vcf.gz')
    tempdir = sysutils.create_tempdir('call_variants', None, quiet, logfile)

    # Working copy of the reference plus the index files built from it
    local_ref = os.path.join(tempdir, 'initial.fasta')
    seq_dict = os.path.join(tempdir, 'initial.dict')
    copy_ref = ['cp', ref_fa, local_ref]
    index_ref = ['samtools', 'faidx', local_ref]
    make_dict = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % local_ref,
        'O=%s' % seq_dict
    ]

    # UnifiedGenotyper invocation
    genotype = [java_heap, gatk_bin, '-T', 'UnifiedGenotyper']
    genotype.extend(['--use_jdk_deflater', '--use_jdk_inflater'])
    genotype.extend(['--num_threads', '%d' % ncpu])
    genotype.extend(['-gt_mode', 'DISCOVERY'])
    genotype.extend(['-glm', 'BOTH'])
    genotype.extend(['--baq', 'OFF'])
    genotype.append('--useOriginalQualities')
    genotype.extend(['-dt', 'NONE'])
    genotype.extend(['--min_base_quality_score', '%d' % min_base_qual])
    genotype.extend(['-ploidy', '4'])
    genotype.extend(['-I', aln_bam])
    genotype.extend(['-R', local_ref])
    genotype.extend(['-o', out_vcf])
    if emit_all:
        genotype.extend(['-out_mode', 'EMIT_ALL_SITES'])

    pipeline = [copy_ref, index_ref, make_dict, genotype]
    sysutils.command_runner(
        pipeline, 'call_variants:GATK', quiet, logfile, debug
    )

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'call_variants:GATK', quiet, logfile)

    return out_vcf
def progressive_refine_assembly(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        max_step=None, subsample=None, seed=None, sample_id='sampleXX',
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Iteratively refine an assembly until it stops changing

    Each iteration calls `refine_assembly_step` using the previous result as
    reference, counts the differences between the new and previous sequences
    and reads the overall alignment rate from the bowtie2 report. Iteration
    stops after `max_step` steps, when no differences are found, or (only
    when not subsampling) when the alignment rate does not improve.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to the initial assembly (fasta)
        outdir (str): Path to output directory
        max_step (int): Maximum number of refinement steps (must be >= 1)
        subsample (int): Number of reads to sample in each step; if None
            all reads are used
        seed (int): Seed for the random number generator
        sample_id (str): Read group ID
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_refined (str): Path to refined assembly
        out_bt2 (str): Path to bowtie2 report of the last step
        out_summary (str): Path to tab-delimited summary of all steps

    Raises:
        PipelineStepError: If `max_step` is missing or less than 1, if a
            refined sequence cannot be matched to exactly one previous
            sequence, or if the alignment rate cannot be found in the
            bowtie2 report

    """
    # Fail fast: without at least one step there is no bowtie2 report to
    # copy, and a None value cannot be used as a loop bound.
    if max_step is None or max_step < 1:
        raise PipelineStepError(
            "max_step must be a positive integer, got %r" % (max_step,)
        )

    # Outputs
    out_refined = os.path.join(outdir, 'refined.fna')
    out_bt2 = os.path.join(outdir, 'refined_bt2.out')
    out_summary = os.path.join(outdir, 'refined_summary.out')

    # --- Initialize
    cur_asm = ref_fa
    cur_alnrate = None
    assemblies = [OrderedDict(), ]
    for s in SeqIO.parse(cur_asm, 'fasta'):
        assemblies[-1][s.id] = s

    # Message log for summary
    summary = [
        ['iteration', 'alnrate', 'diffs'] +
        ['diff:%s' % s for s in assemblies[0].keys()]
    ]

    # Seed random number generator
    random.seed(seed)

    for i in range(1, max_step + 1):
        # Generate a refined assembly
        tmp_refined, tmp_bt2 = refine_assembly_step(
            fq1=fq1, fq2=fq2, fqU=fqU, ref_fa=cur_asm, outdir=outdir,
            iteration=i, subsample=subsample, sample_id=sample_id,
            ncpu=ncpu, xmx=xmx, keep_tmp=keep_tmp,
            quiet=True, logfile=logfile, debug=debug
        )

        # Check whether alignments are different
        diffs = OrderedDict()
        new_seqs = OrderedDict(
            (s.id, s) for s in SeqIO.parse(tmp_refined, 'fasta')
        )
        for id1, seq1 in new_seqs.items():
            poss0 = [
                k for k in assemblies[-1].keys()
                if sequtils.seqid_match(id1, k)
            ]
            if len(poss0) == 1:
                seq0 = assemblies[-1][poss0[0]]
            else:
                raise PipelineStepError("Could not match sequence %s" % id1)
            alns = pairwise2.align.globalms(seq1.seq, seq0.seq, 2, -1, -3, -1)
            # Fewest mismatching columns over all returned alignments
            d = min(
                sum(nc != cc for nc, cc in zip(t[0], t[1])) for t in alns
            )
            diffs[id1] = d

        total_diffs = sum(diffs.values())

        # Check new alignment rate.
        # Plain 'r' mode: the 'U' flag was removed in Python 3.11.
        with open(tmp_bt2, 'r') as fh:
            bt2str = fh.read()
        m = re.search(r'(\d+\.\d+)\% overall alignment rate', bt2str)
        if m is None:
            msg = "Alignment rate not found in bowtie2 output."
            msg += "Output file contents:\n%s\n" % bt2str
            msg += "Aborting."
            raise PipelineStepError(msg)
        new_alnrate = float(m.group(1))

        # Create messages for log
        row = [str(i), '%.02f' % new_alnrate, '%d' % total_diffs, ]
        for k0 in assemblies[0].keys():
            poss1 = [k for k in diffs.keys() if sequtils.seqid_match(k, k0)]
            if len(poss1) == 0:
                row.append('FAIL')
            elif len(poss1) == 1:
                row.append(str(diffs[poss1[0]]))
            else:
                raise PipelineStepError("Multiple matches for %s" % k0)
        summary.append(row)

        # Create messages for console
        sysutils.log_message('\nRefinement result:\n', quiet, logfile)
        sysutils.log_message('\tDifferences:\n', quiet, logfile)
        for s, d in diffs.items():
            sysutils.log_message('\t\t%s\t%d\n' % (s, d), quiet, logfile)
        if total_diffs > 0:
            msg = '\t%d differences found with previous\n' % total_diffs
        else:
            msg = '\tNo differences with previous\n'
        sysutils.log_message(msg, quiet, logfile)

        if cur_alnrate is None:
            msg = '\tAlignment rate: %0.2f\n' % new_alnrate
        elif new_alnrate > cur_alnrate:
            msg = '\tAlignment rate has improved: '
            msg += '%.02f > %.02f\n' % (new_alnrate, cur_alnrate)
        else:
            msg = '\tAlignment rate has not improved: '
            msg += '%.02f <= %.02f\n' % (new_alnrate, cur_alnrate)
        sysutils.log_message(msg, quiet, logfile)

        # Decide whether to keep going
        keep_going = True
        if total_diffs == 0:
            keep_going = False
            sysutils.log_message('Stopping: no differences found\n',
                                 quiet, logfile)

        # Also quit if the alignment rate does not improve. Subsampling
        # reads can change the alignment rate in ways that can be ignored,
        # so this check is only applied when not subsampling.
        if subsample is None:
            if cur_alnrate is not None and new_alnrate <= cur_alnrate:
                keep_going = False
                msg = 'Stopping: alignment rate did not improve\n'
                sysutils.log_message(msg, quiet, logfile)

        cur_asm = tmp_refined
        cur_alnrate = new_alnrate
        assemblies.append(new_seqs)

        if not keep_going:
            break

    # Final outputs
    shutil.copy(cur_asm, out_refined)
    shutil.copy(tmp_bt2, out_bt2)

    with open(out_summary, 'w') as outh:
        print('\n'.join('\t'.join(r) for r in summary), file=outh)

    return out_refined, out_bt2, out_summary