Esempio n. 1
0
def refine_assembly_step(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        iteration=None, subsample=None, seed=None, sample_id='sampleXX',
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Run one refinement step: align reads, call variants, build consensus

    The reads (optionally subsampled) are aligned to ``ref_fa``, variants are
    called on the alignment with ``emit_all=True``, and a consensus is
    generated from the resulting VCF. The consensus and the report returned
    by ``align_reads`` are then copied into ``outdir``.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to the assembly to refine (fasta)
        outdir (str): Path to output directory
        iteration (int): If given, output names carry a zero-padded
            two-digit iteration suffix; otherwise unsuffixed names are used
        subsample (int): Passed to ``sample_reads`` as ``nreads``; if None,
            the reads are used as provided
        seed (int): Seed passed to ``sample_reads``. When subsampling and no
            seed is given, one is drawn with ``random.randrange(1, 1000)``
        sample_id (str): Sample ID forwarded to ``align_reads``
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_refined (str): Path to the refined assembly in ``outdir``
        out_bt2 (str): Path to the copied alignment report in ``outdir``

    """
    # Temporary directory shared by every stage of this step
    tempdir = sysutils.create_tempdir('refine_assembly', None, quiet, logfile)

    if subsample is not None:
        seed = seed if seed is not None else random.randrange(1, 1000)
        full1, full2, fullU = fq1, fq2, fqU
        # Forward the caller's ``quiet`` flag so that subsampling respects it
        # like every other stage does.
        fq1, fq2, fqU = sample_reads.sample_reads(
            fq1=full1, fq2=full2, fqU=fullU, outdir=tempdir,
            nreads=subsample, seed=seed,
            quiet=quiet, logfile=logfile, debug=debug
        )

    # Align to reference
    tmp_aligned, tmp_bt2 = align_reads.align_reads(
        fq1=fq1, fq2=fq2, fqU=fqU, ref_fa=ref_fa, outdir=tempdir,
        ncpu=ncpu, xmx=xmx, sample_id=sample_id,
        keep_tmp=keep_tmp, quiet=quiet, logfile=logfile, debug=debug,
    )

    # Call variants (all sites are emitted so a full consensus can be built)
    tmp_vcf = call_variants.call_variants(
        aln_bam=tmp_aligned, ref_fa=ref_fa, outdir=tempdir,
        emit_all=True,
        ncpu=ncpu, xmx=xmx,
        keep_tmp=keep_tmp, quiet=quiet, logfile=logfile, debug=debug,
    )

    # Generate consensus
    tmp_fasta = vcf_to_consensus.vcf_to_consensus(
        vcf=tmp_vcf, outdir=tempdir, sampidx=0,
        keep_tmp=keep_tmp, quiet=quiet, logfile=logfile
    )

    # Choose output names; numbered when called as part of an iteration
    if iteration is None:
        out_refined = os.path.join(outdir, 'refined.fna')
        out_bt2 = os.path.join(outdir, 'refined_bt2.out')
    else:
        out_refined = os.path.join(outdir, 'refined.%02d.fna' % iteration)
        out_bt2 = os.path.join(outdir, 'refined_bt2.%02d.out' % iteration)

    # Copy results out of the temporary directory before it is removed
    shutil.copy(tmp_fasta, out_refined)
    shutil.copy(tmp_bt2, out_bt2)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'refine_assembly', quiet, logfile)

    return out_refined, out_bt2
Esempio n. 2
0
def stageparser(parser):
    """ Add variant-calling stage options to argparse parser

    Registers the Input/Output, variant calling, and general settings
    option groups, and sets ``call_variants`` as the parser's ``func``
    default.

    Args:
        parser (argparse.ArgumentParser): ArgumentParser object

    Returns:
        None

    """
    group1 = parser.add_argument_group('Input/Output')
    group1.add_argument('--aln_bam',
                        type=sysutils.existing_file,
                        required=True,
                        help='Alignment file.')
    group1.add_argument('--ref_fa',
                        type=sysutils.existing_file,
                        required=True,
                        help='Reference fasta file.')
    group1.add_argument('--outdir',
                        type=sysutils.existing_dir,
                        default='.',
                        help='Output directory')

    group2 = parser.add_argument_group('Variant calling options')
    group2.add_argument('--emit_all',
                        action='store_true',
                        help='Output calls for all sites.')
    group2.add_argument('--min_base_qual',
                        type=int,
                        default=15,
                        help='''Minimum base quality required to consider a
                                base for calling.''')
    group3 = parser.add_argument_group('Settings')
    # An explicit default is required: without it argparse supplies None,
    # which cannot be formatted with '%d' when the command line is built.
    group3.add_argument('--ncpu',
                        type=int,
                        default=1,
                        help='Number of CPU to use')
    group3.add_argument('--xmx',
                        type=int,
                        default=sysutils.get_java_heap_size(),
                        help='Maximum heap size for Java VM, in GB.')
    group3.add_argument('--keep_tmp',
                        action='store_true',
                        help='Do not delete temporary directory')
    group3.add_argument('--quiet',
                        action='store_true',
                        help='''Do not write output to console
                                (silence stdout and stderr)''')
    group3.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        help='Append console output to this file')
    group3.add_argument('--debug',
                        action='store_true',
                        help='Print commands but do not run')
    parser.set_defaults(func=call_variants)
Esempio n. 3
0
def stageparser(parser):
    """ Register the assembly-refinement stage options on a parser

    Three option groups are added (Input/Output, Refinement options,
    Settings) and ``refine_assembly`` is installed as the ``func`` default.

    Args:
        parser (argparse.ArgumentParser): Parser that receives the options

    Returns:
        None

    """
    # --- Input/Output
    io_opts = parser.add_argument_group('Input/Output')
    io_opts.add_argument(
        '--fq1',
        type=sysutils.existing_file,
        help='Fastq file with read 1',
    )
    io_opts.add_argument(
        '--fq2',
        type=sysutils.existing_file,
        help='Fastq file with read 2',
    )
    io_opts.add_argument(
        '--fqU',
        type=sysutils.existing_file,
        help='Fastq file with unpaired reads',
    )
    io_opts.add_argument(
        '--ref_fa',
        type=sysutils.existing_file,
        required=True,
        help='Assembly to refine',
    )
    io_opts.add_argument(
        '--outdir',
        type=sysutils.existing_dir,
        default='.',
        help='Output directory',
    )

    # --- Refinement options
    refine_opts = parser.add_argument_group('Refinement options')
    refine_opts.add_argument(
        '--max_step',
        type=int,
        default=1,
        help='Maximum number of refinement steps',
    )
    refine_opts.add_argument(
        '--subsample',
        type=int,
        help='Use a subsample of reads for refinement.',
    )
    refine_opts.add_argument(
        '--seed',
        type=int,
        help='''Seed for random number generator (ignored if
                                not subsampling).''',
    )
    refine_opts.add_argument(
        '--sample_id',
        default='sampleXX',
        help='Sample ID. Used as read group ID in BAM',
    )

    # --- Settings
    settings = parser.add_argument_group('Settings')
    settings.add_argument(
        '--ncpu',
        type=int,
        default=1,
        help='Number of CPUs to use',
    )
    settings.add_argument(
        '--xmx',
        type=int,
        default=sysutils.get_java_heap_size(),
        help='Maximum heap size for Java VM, in GB.',
    )
    settings.add_argument(
        '--keep_tmp',
        action='store_true',
        help='Do not delete temporary directory',
    )
    settings.add_argument(
        '--quiet',
        action='store_true',
        help='''Do not write output to console
                                (silence stdout and stderr)''',
    )
    settings.add_argument(
        '--logfile',
        type=argparse.FileType('a'),
        help='Append console output to this file',
    )
    settings.add_argument(
        '--debug',
        action='store_true',
        help='Print commands but do not run',
    )

    parser.set_defaults(func=refine_assembly)
Esempio n. 4
0
def stageparser(parser):
    """ Register the read-alignment stage options on a parser

    Three option groups are added (Input/Output, Alignment options,
    Settings) and ``align_reads`` is installed as the ``func`` default.

    Args:
        parser (argparse.ArgumentParser): Parser that receives the options

    Returns:
        None

    """
    # Presets offered for --bt2_preset, end-to-end first and then local
    bt2_presets = [
        'very-fast', 'fast', 'sensitive', 'very-sensitive',
        'very-fast-local', 'fast-local', 'sensitive-local',
        'very-sensitive-local',
    ]

    io_opts = parser.add_argument_group('Input/Output')
    io_opts.add_argument('--fq1', type=sysutils.existing_file,
                         help='Fastq file with read 1')
    io_opts.add_argument('--fq2', type=sysutils.existing_file,
                         help='Fastq file with read 2')
    io_opts.add_argument('--fqU', type=sysutils.existing_file,
                         help='Fastq file with unpaired reads')
    io_opts.add_argument('--ref_fa', type=sysutils.existing_file,
                         required=True, help='Reference fasta file.')
    io_opts.add_argument('--outdir', type=sysutils.existing_dir, default='.',
                         help='Output directory')

    aln_opts = parser.add_argument_group('Alignment options')
    aln_opts.add_argument('--bt2_preset', default='sensitive-local',
                          choices=bt2_presets, help='Bowtie2 preset')
    aln_opts.add_argument('--sample_id', default='sampleXX',
                          help='Sample ID. Used as read group ID in BAM')
    aln_opts.add_argument('--no_realign', action='store_true',
                          help='Do not realign indels')
    aln_opts.add_argument('--remove_duplicates', action='store_true',
                          help='''Remove duplicates from final alignment.
                                Otherwise duplicates are marked but not
                                removed.''')
    aln_opts.add_argument('--encoding', choices=['Phred+33', 'Phred+64'],
                          help='Quality score encoding')

    settings = parser.add_argument_group('Settings')
    settings.add_argument('--ncpu', type=int, default=1,
                          help='Number of CPUs to use')
    settings.add_argument('--xmx', type=int,
                          default=sysutils.get_java_heap_size(),
                          help='Maximum heap size for Java VM, in GB.')
    settings.add_argument('--keep_tmp', action='store_true',
                          help='Do not delete temporary directory')
    settings.add_argument('--quiet', action='store_true',
                          help='''Do not write output to console
                                (silence stdout and stderr)''')
    settings.add_argument('--logfile', type=argparse.FileType('a'),
                          help='Append console output to this file')
    settings.add_argument('--debug', action='store_true',
                          help='Print commands but do not run')

    parser.set_defaults(func=align_reads)
Esempio n. 5
0
def align_reads(
    fq1=None,
    fq2=None,
    fqU=None,
    ref_fa=None,
    outdir='.',
    bt2_preset='sensitive-local',
    sample_id='sampleXX',
    no_realign=False,
    remove_duplicates=False,
    encoding=None,
    ncpu=1,
    xmx=sysutils.get_java_heap_size(),
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to align reads

    The reference is copied into a temporary directory and indexed, reads
    are aligned with bowtie2, the alignment is sorted and indexed with
    samtools, duplicates are marked (or removed) with picard, and, unless
    ``no_realign`` is set, indels are realigned with GATK. The final BAM is
    moved to ``outdir`` and indexed.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Bowtie2 preset to use for alignment
        sample_id (str): Read group ID
        no_realign (bool): Do not realign indels
        remove_duplicates (bool): Remove duplicates from final alignment
        encoding (str): Quality score encoding; guessed from the reads
            when None
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_aligned (str): Path to aligned BAM file
        out_bt2 (str): Path to bowtie2 report

    Raises:
        MissingRequiredArgument: If the combination of ``fq1``, ``fq2`` and
            ``fqU`` is not paired, single, or both.
        PipelineStepError: Re-raised from the bowtie2 step (after echoing
            the bowtie2 report to stderr), or raised when the final BAM was
            not created.

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Guess the quality encoding from whichever read file is available
    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)

    # Check dependencies
    sysutils.check_dependency('bowtie2')
    sysutils.check_dependency('samtools')
    sysutils.check_dependency('picard')

    # Identify correct command for GATK
    GATK_BIN = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Set JVM heap argument (for GATK)
    JAVA_HEAP = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Outputs
    out_aligned = os.path.join(outdir, 'aligned.bam')
    out_bt2 = os.path.join(outdir, 'aligned.bt2.out')

    # Temporary directory
    tempdir = sysutils.create_tempdir('align_reads', None, quiet, logfile)

    # Copy and index initial reference
    curref = os.path.join(tempdir, 'initial.fasta')
    cmd1 = ['cp', ref_fa, curref]
    cmd2 = ['samtools', 'faidx', curref]
    cmd3 = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % curref,
        'O=%s' % os.path.join(tempdir, 'initial.dict')
    ]
    cmd4 = ['bowtie2-build', curref, os.path.join(tempdir, 'initial')]
    sysutils.command_runner([cmd1, cmd2, cmd3, cmd4], 'align_reads:index',
                            quiet, logfile, debug)

    # Align with bowtie2
    cmd5 = [
        'bowtie2',
        '-p', '%d' % ncpu,
        '--phred33' if encoding == "Phred+33" else '--phred64',
        '--no-unal',
        '--rg-id', sample_id,
        '--rg', 'SM:%s' % sample_id,
        '--rg', 'LB:1',
        '--rg', 'PU:1',
        '--rg', 'PL:illumina',
        '--%s' % bt2_preset,
        '-x', '%s' % os.path.join(tempdir, 'initial'),
    ]
    # These are two independent checks on purpose: when both paired and
    # unpaired reads are supplied, bowtie2 must receive -1/-2 AND -U.
    if input_reads in ('paired', 'both'):
        cmd5 += ['-1', fq1, '-2', fq2]
    if input_reads in ('single', 'both'):
        cmd5 += ['-U', fqU]
    cmd5 += ['-S', os.path.join(tempdir, 'aligned.bt2.sam')]
    # bowtie2 writes its alignment summary to stderr; capture it as the report
    cmd5 += ['2>', out_bt2]

    try:
        sysutils.command_runner([
            cmd5,
        ], 'align_reads:bowtie2', quiet, logfile, debug)
    except PipelineStepError:
        # Surface whatever bowtie2 reported before propagating the failure
        if os.path.exists(out_bt2):
            with open(out_bt2, 'r') as fh:
                print('[--- bowtie2 stderr ---]\n%s' % fh.read(),
                      file=sys.stderr)
        raise

    # Convert to BAM, sort, and index
    cmd6 = [
        'samtools', 'view', '-u',
        os.path.join(tempdir, 'aligned.bt2.sam'),
        '|',
        'samtools', 'sort',
        '>',
        os.path.join(tempdir, 'sorted.bam'),
    ]
    cmd7 = ['samtools', 'index', os.path.join(tempdir, 'sorted.bam')]
    sysutils.command_runner([
        cmd6,
        cmd7,
    ], 'align_reads:samsort', quiet, logfile, debug)

    # cur_bam tracks the most recent BAM as it moves through the stages
    cur_bam = os.path.join(tempdir, 'sorted.bam')

    if remove_duplicates:
        sysutils.log_message('[--- Removing duplicates ---]', quiet, logfile)
    else:
        sysutils.log_message('[--- Marking duplicates ---]', quiet, logfile)

    # MarkDuplicates
    cmd8 = [
        'picard',
        'MarkDuplicates',
        'CREATE_INDEX=true',
        'USE_JDK_DEFLATER=true',
        'USE_JDK_INFLATER=true',
        'M=%s' % os.path.join(tempdir, 'rmdup.metrics.txt'),
        'I=%s' % cur_bam,
        'O=%s' % os.path.join(tempdir, 'rmdup.bam'),
    ]
    if remove_duplicates:
        cmd8 += ['REMOVE_DUPLICATES=true']
    sysutils.command_runner([
        cmd8,
    ], 'align_reads:markdups', quiet, logfile, debug)
    cur_bam = os.path.join(tempdir, 'rmdup.bam')

    if no_realign:
        print('[--- Skipping realignment ---]', file=sys.stderr)
    else:
        # RealignerTargetCreator
        cmd9 = [
            JAVA_HEAP, GATK_BIN,
            '-T', 'RealignerTargetCreator',
            '-I', cur_bam,
            '-R', curref,
            '-o', os.path.join(tempdir, 'tmp.intervals'),
        ]
        # IndelRealigner
        cmd10 = [
            JAVA_HEAP, GATK_BIN,
            '-T', 'IndelRealigner',
            '--use_jdk_deflater',
            '--use_jdk_inflater',
            '-maxReads', '1000000',
            '-dt', 'NONE',
            '-I', cur_bam,
            '-R', curref,
            '-targetIntervals', os.path.join(tempdir, 'tmp.intervals'),
            '-o', os.path.join(tempdir, 'realign.bam'),
        ]
        sysutils.command_runner([
            cmd9,
            cmd10,
        ], 'align_reads:realign', quiet, logfile, debug)
        cur_bam = os.path.join(tempdir, 'realign.bam')

    # Check that cur_bam was created
    if not os.path.exists(cur_bam):
        msg = "BAM does not exist: %s" % cur_bam
        raise sysutils.PipelineStepError(msg)

    # Replace any previous output with the final BAM and index it
    cmd11a = ['rm', '-f', out_aligned]
    cmd11b = ['mv', cur_bam, out_aligned]
    cmd11c = ['samtools', 'index', out_aligned]
    sysutils.command_runner([
        cmd11a,
        cmd11b,
        cmd11c,
    ], 'align_reads:copy', quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'align_reads', quiet, logfile)

    return out_aligned, out_bt2
Esempio n. 6
0
def call_variants(
    aln_bam=None,
    ref_fa=None,
    outdir='.',
    emit_all=False,
    min_base_qual=15,
    ncpu=1,
    xmx=sysutils.get_java_heap_size(),
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to call variants

    The reference is copied into a temporary directory and indexed, then
    GATK UnifiedGenotyper is run on the alignment and writes the VCF
    directly into ``outdir``.

    Args:
        aln_bam (str): Path to alignment file (BAM)
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        emit_all (bool): Output calls for all sites
        min_base_qual (int): Minimum base quality for calling
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_vcf (str): Path to output VCF

    """
    # Required external tools
    for tool in ('samtools', 'picard'):
        sysutils.check_dependency(tool)

    # GATK may be installed under either name
    gatk_exe = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Environment assignment that caps the JVM heap for GATK
    heap_env = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Final output location
    out_vcf = os.path.join(outdir, 'variants.vcf.gz')

    # Scratch space for the reference copy and its indexes
    tempdir = sysutils.create_tempdir('call_variants', None, quiet, logfile)
    curref = os.path.join(tempdir, 'initial.fasta')
    ref_dict = os.path.join(tempdir, 'initial.dict')

    # Reference preparation: copy, faidx, sequence dictionary
    copy_ref = ['cp', ref_fa, curref]
    index_ref = ['samtools', 'faidx', curref]
    make_dict = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % curref,
        'O=%s' % ref_dict,
    ]

    # UnifiedGenotyper invocation, assembled as flag/value groups
    genotype = [heap_env, gatk_exe, '-T', 'UnifiedGenotyper']
    genotype.extend(['--use_jdk_deflater', '--use_jdk_inflater'])
    genotype.extend(['--num_threads', '%d' % ncpu])
    genotype.extend(['-gt_mode', 'DISCOVERY'])
    genotype.extend(['-glm', 'BOTH'])
    genotype.extend(['--baq', 'OFF'])
    genotype.append('--useOriginalQualities')
    genotype.extend(['-dt', 'NONE'])
    genotype.extend(['--min_base_quality_score', '%d' % min_base_qual])
    genotype.extend(['-ploidy', '4'])
    genotype.extend(['-I', aln_bam])
    genotype.extend(['-R', curref])
    genotype.extend(['-o', out_vcf])
    if emit_all:
        genotype.extend(['-out_mode', 'EMIT_ALL_SITES'])

    pipeline = [copy_ref, index_ref, make_dict, genotype]
    sysutils.command_runner(
        pipeline, 'call_variants:GATK', quiet, logfile, debug
    )

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'call_variants:GATK', quiet, logfile)

    return out_vcf
Esempio n. 7
0
def progressive_refine_assembly(
        fq1=None, fq2=None, fqU=None, ref_fa=None, outdir='.',
        max_step=None, subsample=None, seed=None, sample_id='sampleXX',
        ncpu=1, xmx=sysutils.get_java_heap_size(),
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Iteratively refine an assembly until it stops changing

    Each iteration calls ``refine_assembly_step`` on the current assembly,
    counts the differences between every new sequence and its counterpart
    from the previous iteration, and parses the overall alignment rate from
    the bowtie2 report. Iteration stops after ``max_step`` steps, when no
    differences are found, or (only when not subsampling) when the alignment
    rate does not improve.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to the initial assembly (fasta)
        outdir (str): Path to output directory
        max_step (int): Maximum number of refinement steps; None is treated
            as a single step
        subsample (int): Number of reads to subsample for each step
        seed (int): Seed for the module-level random number generator
        sample_id (str): Sample ID forwarded to the refinement step
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_refined (str): Path to the final refined assembly
        out_bt2 (str): Path to the bowtie2 report of the last step
        out_summary (str): Path to the tab-separated per-iteration summary

    Raises:
        PipelineStepError: If ``max_step`` is less than 1, a refined
            sequence cannot be matched to exactly one previous sequence,
            several refined sequences match one initial sequence, or the
            alignment rate is missing from the bowtie2 report.

    """
    # At least one step must run; otherwise there is no report to copy.
    if max_step is None:
        max_step = 1
    if max_step < 1:
        raise PipelineStepError(
            "max_step must be at least 1 (got %s)" % max_step
        )

    # Outputs
    out_refined = os.path.join(outdir, 'refined.fna')
    out_bt2 = os.path.join(outdir, 'refined_bt2.out')
    out_summary = os.path.join(outdir, 'refined_summary.out')

    #--- Initialize
    cur_asm = ref_fa
    cur_alnrate = None
    # assemblies[k] holds the sequences after k refinement steps (0 = input)
    assemblies = [OrderedDict(), ]
    for s in SeqIO.parse(cur_asm, 'fasta'):
        assemblies[-1][s.id] = s

    # Message log for summary
    summary = [
        ['iteration', 'alnrate', 'diffs'] +
        ['diff:%s' % s for s in assemblies[0].keys()]
    ]

    # Seed random number generator
    random.seed(seed)

    # Matches e.g. "95.32% overall alignment rate" in the bowtie2 report
    alnrate_re = re.compile(r'(\d+\.\d+)\% overall alignment rate')

    for i in range(1, max_step+1):
        # Generate a refined assembly
        tmp_refined, tmp_bt2 = refine_assembly_step(
            fq1=fq1, fq2=fq2, fqU=fqU, ref_fa=cur_asm, outdir=outdir,
            iteration=i, subsample=subsample, sample_id=sample_id,
            ncpu=ncpu, xmx=xmx, keep_tmp=keep_tmp,
            quiet=True, logfile=logfile, debug=debug
        )

        # Check whether alignments are different
        diffs = OrderedDict()
        new_seqs = OrderedDict(
            (s.id, s) for s in SeqIO.parse(tmp_refined, 'fasta')
        )
        for id1, seq1 in new_seqs.items():
            poss0 = [
                k for k in assemblies[-1].keys()
                if sequtils.seqid_match(id1, k)
            ]
            if len(poss0) == 1:
                seq0 = assemblies[-1][poss0[0]]
            else:
                raise PipelineStepError("Could not match sequence %s" % id1)
            # Global alignment; the difference count is the smallest number
            # of mismatching columns over all returned alignments.
            alns = pairwise2.align.globalms(seq1.seq, seq0.seq, 2, -1, -3, -1)
            d = min(sum(nc != cc for nc, cc in zip(t[0], t[1])) for t in alns)
            diffs[id1] = d

        total_diffs = sum(diffs.values())

        # Check new alignment rate ('r' already gives universal newlines;
        # the legacy 'rU' mode is rejected by Python 3.11 and later)
        with open(tmp_bt2, 'r') as fh:
            bt2str = fh.read()
        m = alnrate_re.search(bt2str)
        if m is None:
            msg = "Alignment rate not found in bowtie2 output. "
            msg += "Output file contents:\n%s\n" % bt2str
            msg += "Aborting."
            raise PipelineStepError(msg)
        new_alnrate = float(m.group(1))

        # Create messages for log
        row = [str(i), '%.02f' % new_alnrate, '%d' % total_diffs, ]
        for k0 in assemblies[0].keys():
            poss1 = [k for k in diffs.keys() if sequtils.seqid_match(k, k0)]
            if len(poss1) == 0:
                row.append('FAIL')
            elif len(poss1) == 1:
                row.append(str(diffs[poss1[0]]))
            else:
                raise PipelineStepError("Multiple matches for %s" % k0)
        summary.append(row)

        # Create messages for console
        sysutils.log_message('\nRefinement result:\n', quiet, logfile)
        sysutils.log_message('\tDifferences:\n', quiet, logfile)
        for s, d in diffs.items():
            sysutils.log_message('\t\t%s\t%d\n' % (s, d), quiet, logfile)
        if total_diffs > 0:
            msg = '\t%d differences found with previous\n' % total_diffs
        else:
            msg = '\tNo differences with previous\n'
        sysutils.log_message(msg, quiet, logfile)

        if cur_alnrate is None:
            msg = '\tAlignment rate: %0.2f\n' % new_alnrate
        elif new_alnrate > cur_alnrate:
            msg = '\tAlignment rate has improved: '
            msg += '%.02f > %.02f\n' % (new_alnrate, cur_alnrate)
        else:
            msg = '\tAlignment rate has not improved: '
            msg += '%.02f <= %.02f\n' % (new_alnrate, cur_alnrate)
        sysutils.log_message(msg, quiet, logfile)

        # Decide whether to keep going
        keep_going = True
        if total_diffs == 0:
            keep_going = False
            sysutils.log_message('Stopping: no differences found\n',
                                 quiet, logfile)

        # A non-improving alignment rate is only a stop condition when all
        # reads are used; with subsampling the rate varies between steps
        # simply because a different subset of reads is drawn.
        if subsample is None:  # not subsampling
            if cur_alnrate is not None and new_alnrate <= cur_alnrate:
                keep_going = False
                msg = 'Stopping: alignment rate did not improve\n'
                sysutils.log_message(msg, quiet, logfile)

        cur_asm = tmp_refined
        cur_alnrate = new_alnrate
        assemblies.append(new_seqs)

        if not keep_going:
            break

    # Final outputs
    shutil.copy(cur_asm, out_refined)
    shutil.copy(tmp_bt2, out_bt2)

    with open(out_summary, 'w') as outh:
        print('\n'.join('\t'.join(r) for r in summary), file=outh)

    return out_refined, out_bt2, out_summary