Beispiel #1
0
def build_tree_NG(seqs=None, in_type='FASTA', output_name='hp_tree', outdir='.',
                  treedir='hp_tree', model='GTR', bs_trees=None,
                  outgroup=None, branch_length=None, consense=None,
                  rand_tree=None, pars_tree=None,
                  user_tree=None, search=None, search_1random=None, all=None,
                  constraint_tree=None, bsconverge=None, bs_msa=None,
                  bs_tree_cutoff=None, bs_metric=None, bootstrap=None,
                  check=None, log=None, loglh=None, redo=None,
                  terrace=None, seed=12345, version=None, quiet=False,
                  logfile=None, debug=False, ncpu=1,
                  keep_tmp=False):
    """ Pipeline step to run raxml-ng and collect its output files

    The raxml-ng command is assembled from the arguments below, run with a
    temporary directory as output prefix, and any result files that exist
    afterwards are copied into ``<outdir>/<treedir>``.

    Args:
        seqs (str): Path to input alignment
        in_type (str): Alignment format; 'FASTA' and 'PHYLIP' are handled
        output_name (str): Prefix used for raxml-ng output file names
        outdir (str): Path to output directory
        treedir (str): Name of the results directory created inside outdir
        model (str): Value passed to ``--model``
        bs_trees: Value passed to ``--bs-trees``
        outgroup: Value passed to ``--outgroup``
        branch_length: Value passed to ``--brlen``
        consense: Value passed to ``--consense``
        rand_tree (int): N for ``--tree rand{N}``
        pars_tree (int): N for ``--tree pars{N}``
        user_tree (str): Path passed to ``--tree`` (made absolute)
        search (bool): Add ``--search``
        search_1random (bool): Add ``--search1``
        all (bool): Add ``--all``
        constraint_tree (str): Path passed to ``--tree-constraint``
        bsconverge (bool): Add ``--bsconverge``
        bs_msa (bool): Add ``--bsmsa``
        bs_tree_cutoff (float): Value passed to ``--bs-cutoff``
        bs_metric: Value passed to ``--bs-metric``
        bootstrap (bool): Add ``--bootstrap``
        check (bool): Add ``--check``
        log: Value passed to ``--log``
        loglh (bool): Add ``--loglh``
        redo (bool): Add ``--redo``
        terrace (bool): Add ``--terrace``
        seed (int): Value passed to ``--seed``
        version (bool): Only run ``raxml-ng -v`` and return
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        ncpu (int): Value passed to ``--threads``
        keep_tmp (bool): Do not delete temporary directory

    Returns:
        None

    Raises:
        sysutils.PipelineStepError: If no alignment is provided.

    """
    sysutils.check_dependency('raxml-ng')

    if version is True:
        cmd2 = ['raxml-ng', '-v']
        sysutils.command_runner([cmd2], 'build_tree_NG', quiet, logfile, debug)
        return

    if seqs is None:
        msg = 'No alignment provided.'
        raise sysutils.PipelineStepError(msg)

    # Set Output Directory
    output_dir = os.path.join(outdir, treedir)
    cmd0 = ['mkdir -p %s' % output_dir]
    sysutils.command_runner([cmd0], 'build_tree_NG', quiet, logfile, debug)

    # Fix sequence names: check_name_compatibility is given the input
    # alignment and a destination path; raxml-ng is then run on that
    # destination file rather than on the original input.
    fixed_names = {
        'FASTA': 'seqs_fixednames.fasta',
        'PHYLIP': 'seqs_fixednames.phy',
    }
    msa_path = None
    if in_type in fixed_names:
        msa_path = os.path.join(output_dir, fixed_names[in_type])
        check_name_compatibility(seqs, msa_path, in_type)

    # Create temporary directory
    tempdir = sysutils.create_tempdir('build_tree_NG', None, quiet, logfile)

    # start raxml command
    cmd1 = [
        'raxml-ng',
        '--prefix %s/%s' % (os.path.abspath(tempdir), output_name),
        '--threads %d' % ncpu,
        '--seed %d' % seed,
        '--model %s' % model,
    ]
    if msa_path is not None:
        cmd1 += ['--msa', '%s' % msa_path]
    if branch_length is not None:
        cmd1 += ['--brlen', '%s' % branch_length]
    if consense is not None:
        cmd1 += ['--consense', '%s' % consense]

    # Parsimony and random starting trees share a single --tree option,
    # comma separated when both are requested.
    start_trees = []
    if pars_tree is not None:
        start_trees.append('pars{%d}' % pars_tree)
    if rand_tree is not None:
        start_trees.append('rand{%d}' % rand_tree)
    if start_trees:
        cmd1 += ['--tree %s' % ','.join(start_trees)]

    if user_tree is not None:
        cmd1 += ['--tree', '%s' % os.path.abspath(user_tree)]
    if search is True:
        cmd1 += ['--search']
    if search_1random is True:
        cmd1 += ['--search1']
    if all is True:
        cmd1 += ['--all']
    if constraint_tree is not None:
        cmd1 += ['--tree-constraint', '%s' % os.path.abspath(constraint_tree)]
    if outgroup is not None:
        cmd1 += ['--outgroup', '%s' % outgroup]
    if bsconverge is True:
        cmd1 += ['--bsconverge']
    if bs_msa is True:
        cmd1 += ['--bsmsa']
    if bs_trees is not None:
        cmd1 += ['--bs-trees %s' % bs_trees]
    if bs_tree_cutoff is not None:
        cmd1 += ['--bs-cutoff', '%f' % bs_tree_cutoff]
    if bs_metric is not None:
        cmd1 += ['--bs-metric', '%s' % bs_metric]
    if bootstrap is True:
        cmd1 += ['--bootstrap']
    if check is True:
        cmd1 += ['--check']
    if log is not None:
        cmd1 += ['--log', '%s' % log]
    if loglh is True:
        cmd1 += ['--loglh']
    if terrace is True:
        cmd1 += ['--terrace']
    # Same test as every other on/off flag: previously `is not None`, which
    # forced --redo even when the caller passed redo=False.
    if redo is True:
        cmd1 += ['--redo']

    sysutils.command_runner([cmd1, ], 'build_tree_NG', quiet, logfile, debug)

    # copy files from tmpdir to output directory (note - took some out here)
    def _result(suffix):
        return os.path.join(tempdir, '%s.raxml.%s' % (output_name, suffix))

    sources = [_result(s) for s in (
        'bestTree', 'bestPartitionTrees', 'bestModel', 'bootstraps',
    )]

    # Bootstrap replicate alignments carry a replicate number in their name
    # (<prefix>.raxml.bootstrapMSA.<REP>.phy), so match on prefix and suffix
    # instead of looking for a file literally called "<REP>".
    msa_prefix = '%s.raxml.bootstrapMSA.' % output_name
    if os.path.isdir(tempdir):
        sources += [
            os.path.join(tempdir, fname)
            for fname in sorted(os.listdir(tempdir))
            if fname.startswith(msa_prefix) and fname.endswith('.phy')
        ]

    sources += [_result(s) for s in (
        'ckp', 'consensusTree', 'log', 'mlTrees', 'startTree', 'support',
        'terrace', 'terraceNewick',
    )]

    dest_dir = os.path.abspath(output_dir)
    for src in sources:
        # Which files exist depends on the options that were run
        if os.path.exists(src):
            shutil.copy(src, dest_dir)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'build_tree_NG', quiet, logfile)

    cmd6 = ['echo', 'Stage completed. Output files are located here: %s\n' % os.path.abspath(output_dir)]
    sysutils.command_runner([cmd6, ], 'build_tree_NG', quiet, logfile, debug)
Beispiel #2
0
def call_variants(
    aln_bam=None,
    ref_fa=None,
    outdir='.',
    emit_all=False,
    min_base_qual=15,
    ncpu=1,
    xmx=sysutils.get_java_heap_size(),
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to call variants with GATK UnifiedGenotyper

    The reference is copied into a temporary directory and indexed there
    (samtools faidx, picard CreateSequenceDictionary) before the genotyper
    is run against the alignment.

    Args:
        aln_bam (str): Path to alignment file (BAM)
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        emit_all (bool): Output calls for all sites
        min_base_qual (int): Minimum base quality for calling
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_vcf (str): Path to output VCF

    """
    # Required external tools
    for tool in ('samtools', 'picard'):
        sysutils.check_dependency(tool)

    # GATK may be installed under either name
    gatk_exe = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Heap limit is handed to the JVM through an environment assignment
    # placed in front of the GATK command
    heap_setting = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    out_vcf = os.path.join(outdir, 'variants.vcf.gz')

    tempdir = sysutils.create_tempdir('call_variants', None, quiet, logfile)

    # Private, indexed copy of the reference
    curref = os.path.join(tempdir, 'initial.fasta')
    copy_ref = ['cp', ref_fa, curref]
    index_ref = ['samtools', 'faidx', curref]
    dict_ref = ['picard', 'CreateSequenceDictionary']
    dict_ref.append('R=%s' % curref)
    dict_ref.append('O=%s' % os.path.join(tempdir, 'initial.dict'))

    # UnifiedGenotyper invocation, built up option by option
    genotype = [heap_setting, gatk_exe, '-T', 'UnifiedGenotyper']
    genotype += ['--use_jdk_deflater', '--use_jdk_inflater']
    genotype += ['--num_threads', '%d' % ncpu]
    genotype += ['-gt_mode', 'DISCOVERY']
    genotype += ['-glm', 'BOTH']
    genotype += ['--baq', 'OFF']
    genotype += ['--useOriginalQualities']
    genotype += ['-dt', 'NONE']
    genotype += ['--min_base_quality_score', '%d' % min_base_qual]
    genotype += ['-ploidy', '4']
    genotype += ['-I', aln_bam]
    genotype += ['-R', curref]
    genotype += ['-o', out_vcf]
    if emit_all:
        genotype.extend(['-out_mode', 'EMIT_ALL_SITES'])

    pipeline = [copy_ref, index_ref, dict_ref, genotype]
    sysutils.command_runner(
        pipeline, 'call_variants:GATK', quiet, logfile, debug
    )

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'call_variants:GATK', quiet, logfile)

    return out_vcf
Beispiel #3
0
def ec_reads(
    fq1=None,
    fq2=None,
    fqU=None,
    outdir='.',
    ncpu=1,
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to error-correct reads using spades

    Runs ``spades.py --only-error-correction`` into a temporary directory,
    reads the ``corrected/corrected.yaml`` manifest it leaves there and
    decompresses the listed files into the output fastq files.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        ncpu (int): Number of CPUs to use
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to corrected fastq file with read 1
        out2 (str): Path to corrected fastq file with read 2
        outU (str): Path to corrected fastq file with unpaired reads

        All three paths are always returned; a file is only written when
        the manifest lists the corresponding read set.

    Raises:
        MissingRequiredArgument: If the combination of fq1/fq2/fqU is not
            one of (fq1, fq2), (fqU) or (fq1, fq2, fqU).
        sysutils.PipelineStepError: If the manifest file is not found after
            spades was run.

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('spades.py')

    # Outputs
    out1 = os.path.join(outdir, 'corrected_1.fastq')
    out2 = os.path.join(outdir, 'corrected_2.fastq')
    outU = os.path.join(outdir, 'corrected_U.fastq')

    # Temporary directory
    tempdir = sysutils.create_tempdir('ec_reads', None, quiet, logfile)

    # spades command
    cmd1 = [
        'spades.py',
        '-o', tempdir,
        '-t', '%d' % ncpu,
        '--only-error-correction',
    ]
    if input_reads in ('paired', 'both'):
        cmd1 += ['-1', os.path.abspath(fq1), '-2', os.path.abspath(fq2)]
    if input_reads in ('single', 'both'):
        cmd1 += ['-s', os.path.abspath(fqU)]

    sysutils.command_runner([cmd1, ], 'ec_reads', quiet, logfile, debug)

    # Copy files
    yaml_file = os.path.join(tempdir, 'corrected/corrected.yaml')
    if not os.path.exists(yaml_file):
        # The exception used to be constructed but never raised, so a
        # missing manifest fell through to open() below.
        raise sysutils.PipelineStepError(
            "YAML file %s not found" % yaml_file
        )

    # Plain 'r': the 'U' mode flag is rejected by Python >= 3.11.
    # NOTE(review): yaml.FullLoader is applied to a file written by the
    # spades run above, not to caller-supplied data.
    with open(yaml_file, 'r') as fh:
        d = yaml.load(fh, Loader=yaml.FullLoader)[0]

    # One gunzip per read set present in the manifest; the '>' token relies
    # on command_runner handing the command to a shell.
    cmds = []
    for key, dest in (
            ('left reads', out1),
            ('right reads', out2),
            ('single reads', outU),
    ):
        if key in d:
            cmds.append(['gunzip', '-c'] + sorted(d[key]) + ['>', dest])

    sysutils.command_runner(cmds, 'ec_reads', quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'ec_reads', quiet, logfile)

    return out1, out2, outU
Beispiel #4
0
def assemble_denovo_trinity(fq1=None,
                            fq2=None,
                            fqU=None,
                            outdir='.',
                            min_contig_length=200,
                            subsample=None,
                            seed=None,
                            ncpu=1,
                            keep_tmp=False,
                            quiet=False,
                            logfile=None,
                            debug=False,
                            **kwargs):
    """ Pipeline step to assemble reads using Trinity (denovo)

    Trinity is run into a temporary directory, its ``Trinity.fasta`` is
    copied to ``<outdir>/contigs.fa`` and, if that file exists afterwards,
    assembly statistics are written to ``<outdir>/assembly_summary.txt``.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads. Only passed to
            Trinity when no paired reads are given.
        outdir (str): Path to output directory
        min_contig_length (int): minimum assembled contig length to report
        subsample (int): Accepted but not used by this function.
        seed (int): Accepted but not used by this function.
        ncpu (int): Number of CPUs to use
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        **kwargs: Not used.

    Returns:
        out1 (str): Path to assembled contigs file (fasta format)

    Raises:
        MissingRequiredArgument: If the combination of fq1/fq2/fqU is not
            one of (fq1, fq2), (fqU) or (fq1, fq2, fqU).

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('Trinity')

    # Outputs
    out1 = os.path.join(outdir, 'contigs.fa')

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_trinity', None, quiet, logfile)

    # Trinity command
    cmd1 = [
        'Trinity',
        '--min_contig_length',
        '%d' % min_contig_length,
        '--CPU',
        '%d' % ncpu,
        #'--max_memory', '%dG' % max_memory,
        '--seqType',
        'fq',
        '--output',
        tempdir,
    ]
    if input_reads in ('paired', 'both'):
        # With "both", only the paired reads are handed to Trinity; the
        # unpaired file is not added to the command.
        cmd1 += [
            '--left',
            os.path.abspath(fq1),
            '--right',
            os.path.abspath(fq2),
        ]
    else:
        # Only "single" reaches this branch
        cmd1 += [
            '--single',
            os.path.abspath(fqU),
        ]

    # Copy command
    cmd2 = [
        'cp',
        os.path.join(tempdir, 'Trinity.fasta'),
        out1,
    ]

    sysutils.command_runner([
        cmd1,
        cmd2,
    ], 'assemble_trinity', quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_trinity', quiet, logfile)

    if os.path.isfile(out1):
        summary_path = os.path.join(outdir, 'assembly_summary.txt')
        # Both handles are context-managed so the contigs file is closed
        # again; plain 'r' because the 'U' mode flag is rejected by
        # Python >= 3.11.
        with open(summary_path, 'w') as outh, open(out1, 'r') as contigs_fh:
            sequtils.assembly_stats(contigs_fh, outh)

    return out1
Beispiel #5
0
def assemble_scaffold(
        contigs_fa=None, ref_fa=None, outdir='.',
        seqname='sample01',
        keep_tmp=False, quiet=False, logfile=None, debug=False
    ):
    """ Pipeline step that scaffolds assembled contigs against a reference

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        seqname (str): Name to append to scaffold sequence
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_scaffold (str): Path to scaffold FASTA. Reference positions that
                            were not covered have 'n'
        out_imputed (str):  Path to imputed FASTA. Reference positions that
                            were not covered have reference base.
        out_aln (str):      Path to FASTA alignment between scaffold and
                            reference.
        out_padded (str):   Path to output with all contigs aligned to
                            reference.
    """
    # External tools that must be available
    for tool in ('nucmer', 'delta-filter', 'show-tiling'):
        sysutils.check_dependency(tool)

    # Output locations
    out_scaffold, out_imputed, out_aln, out_padded = (
        os.path.join(outdir, fname)
        for fname in (
            'scaffold_assembly.fa',
            'scaffold_imputed.fa',
            'scaffold_aligned.fa',
            'scaffold_padded.out',
        )
    )

    # Scratch space for intermediate files
    tempdir = sysutils.create_tempdir(
        'assemble_scaffold', None, quiet, logfile
    )

    # Contigs with sequence IDs only (descriptions removed)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    with open(out_padded, 'w') as pad_fh:
        scaffolds = alignutils.assemble_to_ref(
            tmp_contigs_fa, ref_fa, tempdir, pad_fh=pad_fh,
            quiet=quiet, logfile=logfile, debug=debug
        )

    def _label(ref_id):
        # Reference ID up to its first '.', followed by the sample name
        return '%s.%s' % (ref_id.split('.')[0], seqname)

    ref_ids = sorted(scaffolds.keys())

    # Scaffold sequences
    with open(out_scaffold, 'w') as outh:
        for ref_id in ref_ids:
            seq = scaffolds[ref_id].scaffold()
            outh.write('>%s\n%s\n' % (_label(ref_id), sequtils.wrap(seq)))

    # Imputed sequences
    with open(out_imputed, 'w') as outh:
        for ref_id in ref_ids:
            seq = scaffolds[ref_id].imputed()
            outh.write('>%s\n%s\n' % (_label(ref_id), sequtils.wrap(seq)))

    # Pairwise alignments (reference record first) for other pipeline stages
    with open(out_aln, 'w') as outh:
        for ref_id in ref_ids:
            label = _label(ref_id)
            entry = scaffolds[ref_id]
            outh.write('>REF|%s\n%s\n' % (label, entry.raln()))
            outh.write('>%s\n%s\n' % (label, entry.qaln()))

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_scaffold', quiet, logfile)

    return out_scaffold, out_imputed, out_aln, out_padded
Beispiel #6
0
def trim_reads(
    fq1=None,
    fq2=None,
    fqU=None,
    outdir=".",
    adapter_file=None,
    trimmers=TRIMMERS,
    encoding=None,
    ncpu=1,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to trim reads

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        adapter_file (str): Path to adapter file (fasta)
        trimmers (`list` of `str`): Trim commands for trimmomatic
        encoding (str): Quality score encoding. When None it is guessed
            from the input reads.
        ncpu (int): Number of CPUs to use
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to trimmed fastq file with read 1
        out2 (str): Path to trimmed fastq file with read 2
        outU (str): Path to trimmed fastq file with unpaired reads
        out_summary (str): Path to summary file

        NOTE(review): for single-end input only ``(None, None, outU)`` is
        returned; the summary path is not part of that tuple. Kept as-is
        for existing callers - confirm whether a 4-tuple is wanted.

    Raises:
        MissingRequiredArgument: If the input is neither (fq1 and fq2) nor
            (fqU alone).
        PipelineStepError: If trimmomatic is not found and the
            ``Trimmomatic`` environment variable is not set.
    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 and --fq2) OR (--fqU)"
        raise MissingRequiredArgument(msg)

    # There are two different ways to call Trimmomatic. If using modules on
    # C1, the path to the jar file is stored in the "$Trimmomatic"
    # environment variable. Otherwise, if using conda, the "trimmomatic"
    # script is in PATH.

    # Check dependencies
    try:
        sysutils.check_dependency('trimmomatic')
    except PipelineStepError:
        if 'Trimmomatic' not in os.environ:
            raise
        # The variable is left unexpanded here
        cmd1 = ['java', '-jar', '$Trimmomatic']
    else:
        cmd1 = ['trimmomatic']

    # Get encoding
    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)
    phred_flag = '-phred33' if encoding == "Phred+33" else '-phred64'

    # Outputs for both single and paired
    out_summary = os.path.join(outdir, 'trimmomatic_summary.out')
    outU = os.path.join(outdir, 'trimmed_U.fastq')

    # String equality, not identity: `is` on str literals only works by
    # accident of interning and triggers a SyntaxWarning on Python >= 3.8.
    if input_reads == 'single':
        # Outputs
        out1 = out2 = None
        # Trimmomatic command
        cmd1 += [
            'SE',
            '-threads',
            '%d' % ncpu,
            phred_flag,
            '-summary',
            out_summary,
            fqU,
            outU,
        ]
        # Specify trimming steps
        if adapter_file is not None:
            # Switch a paired-end adapter file name to its single-end form
            adapter_file = adapter_file.replace('PE', 'SE')
            cmd1.append("ILLUMINACLIP:%s:2:30:10" % adapter_file)
        cmd1 += trimmers

        # Run command
        sysutils.command_runner([
            cmd1,
        ], 'trim_reads', quiet, logfile, debug)
        return out1, out2, outU
    elif input_reads == 'paired':
        # Outputs
        out1 = os.path.join(outdir, 'trimmed_1.fastq')
        out2 = os.path.join(outdir, 'trimmed_2.fastq')
        tmp1U = os.path.join(outdir, 'tmp1U.fq')
        tmp2U = os.path.join(outdir, 'tmp2U.fq')
        # Trimmomatic command
        cmd1 += [
            'PE',
            '-threads',
            '%d' % ncpu,
            phred_flag,
            '-summary',
            out_summary,
            fq1,
            fq2,
            out1,
            tmp1U,
            out2,
            tmp2U,
        ]
        # Specify trimming steps
        if adapter_file is not None:
            cmd1.append("ILLUMINACLIP:%s:2:30:10" % adapter_file)
        cmd1 += trimmers

        # Concat files command: both temporary unpaired files are appended
        # to the unpaired output, then removed
        cmd2 = [
            'cat',
            tmp1U,
            tmp2U,
            '>>',
            outU,
        ]
        cmd3 = [
            'rm',
            '-f',
            tmp1U,
            tmp2U,
        ]

        # Run commands
        sysutils.command_runner([
            cmd1,
            cmd2,
            cmd3,
        ], 'trim_reads', quiet, logfile, debug)
        return out1, out2, outU, out_summary
Beispiel #7
0
def build_tree(seqs=None,
               data_type='NUC',
               run_full_analysis=None,
               output_name='build_tree.tre',
               outdir='.',
               treedir='hp_build_tree',
               model='GTRGAMMAIX',
               outgroup=None,
               parsimony_seed=1234,
               wgtFile=None,
               secsub=None,
               bootstrap=None,
               bootstrap_threshold=None,
               numCat=None,
               rand_starting_tree=None,
               convergence_criterion=None,
               likelihoodEpsilon=None,
               excludeFileName=None,
               algo_option=None,
               cat_model=None,
               groupingFile=None,
               placementThreshold=None,
               disable_pattern_compression=None,
               InitialRearrangement=None,
               posteriori=None,
               print_intermediate_trees=None,
               majorityrule=None,
               print_branch_length=None,
               ICTCmetrics=None,
               partition_branch_length=None,
               disable_check=None,
               AAmodel=None,
               multiplemodelFile=None,
               binarytree=None,
               BinaryParameterFile=None,
               SecondaryStructure=None,
               UserStartingTree=None,
               median_GAMMA=None,
               version_info=None,
               rate_heterogeneity=None,
               window=None,
               RapidBootstrapNumSeed=None,
               random_addition=None,
               starting_tree=None,
               quartetGroupingFileName=None,
               multipleTreeFile=None,
               NumberofRuns=None,
               mesquite=None,
               silent=None,
               noseqcheck=None,
               nobfgs=None,
               epaPlaceNum=None,
               epaProbThreshold=None,
               epaLikelihood=None,
               HKY85=None,
               BootstrapPerm=None,
               quiet=False,
               logfile=None,
               debug=False,
               keep_tmp=False,
               option_help=None):

    # Check dependencies
    sysutils.check_dependency('raxmlHPC')

    cmd1 = []

    # check for required input

    if option_help is True:
        cmd4 = ['raxmlHPC', '-h']
        sysutils.command_runner([cmd4], 'build_tree', quiet, logfile, debug)
        return

    if version_info is True:
        cmd5 = ['raxmlHPC', '-v']
        sysutils.command_runner([cmd5], 'build_tree', quiet, logfile, debug)

    if seqs is None and option_help is None:
        msg = 'No alignment provided'
        raise sysutils.PipelineStepError(msg)

    # check model compatibility
    if data_type is not 'AA' and 'PROT' in model:
        msg = 'Protein model given for non-amino acid data'
        raise sysutils.PipelineStepError(msg)
    if data_type is not 'MULTI' and 'MULTI' in model:
        msg = 'Multi-state model given for non-multi-state data'
        raise sysutils.PipelineStepError(msg)
    if data_type is not 'BIN' and 'BIN' in model:
        msg = 'Binary model given for non-binary data'
        raise sysutils.PipelineStepError(msg)
    if data_type is not 'NUC':
        if data_type not in model:
            msg = 'model and data type not compatible'
            raise sysutils.PipelineStepError(msg)

    # Set Output Directory
    output_dir = os.path.join(outdir, treedir)
    cmd0 = ['mkdir -p %s' % output_dir]

    sysutils.command_runner([cmd0], 'build_tree', quiet, logfile, debug)

    # Temporary directory
    tempdir = sysutils.create_tempdir('build_tree', None, quiet, logfile)

    if run_full_analysis is True:
        # generate seeds
        seed1 = random.randint(10000, 99999)
        seed2 = random.randint(10000, 99999)
        cmd1 = [
            'echo',
            'Using parsimony seed %s and bootstrap seed %s' % (seed1, seed2)
        ]
        sysutils.command_runner([cmd1], 'build_tree', quiet, logfile, debug)
        # run raxml
        cmd2 = [
            'raxmlHPC',
            '-w %s' % os.path.abspath(tempdir), '-f a',
            '-p %d' % seed1,
            '-x %d' % seed2, '-# 100',
            '-m %s' % model,
            '-s %s' % os.path.abspath(seqs),
            '-n %s' % output_name
        ]
        sysutils.command_runner([cmd2], 'build_tree', quiet, logfile, debug)

    else:
        # start raxml command
        cmd1 = [
            'raxmlHPC',
            '-w %s' % os.path.abspath(tempdir),
            '-p %d' % parsimony_seed,
            '-m %s' % model
        ]

        if outgroup is not None:
            cmd1 += ['-o', '%s' % outgroup]
        if wgtFile is not None:
            cmd1 += ['-a', '%s' % os.path.join('.', wgtFile)]
        if secsub is not None and SecondaryStructure is not None:
            cmd1 += ['-A', '%s' % secsub]
            cmd1 += ['-S', '%s' % os.path.join('.', SecondaryStructure)]
        elif secsub is not None and SecondaryStructure is None:
            msg = 'Need to specify a file defining the secondary structure via the ­S option'
            raise sysutils.PipelineStepError(msg)
        if bootstrap is not None:
            cmd1 += ['-b', '%d' % bootstrap]
        if bootstrap_threshold is not None:
            cmd1 += ['-B', '%f' % bootstrap_threshold]
        if numCat is not None:
            cmd1 += ['-c', '%d' % numCat]
        if rand_starting_tree is True:
            cmd1 += ['-d']
        if convergence_criterion is True:
            cmd1 += ['-D']
        if likelihoodEpsilon is not None:
            cmd1 += ['-e', '%f' % likelihoodEpsilon]
        if excludeFileName is not None:
            cmd1 += ['-E', '%s' % os.path.join('.', excludeFileName)]
        if algo_option is not None and algo_option in [
                'a', 'A', 'b', 'B', 'c', 'C', 'd', 'D', 'e', 'E', 'F', 'g',
                'G', 'h', 'H', 'i', 'I', 'j', 'J', 'k', 'm', 'n', 'N', 'o',
                'p', 'q', 'r', 'R', 's', 'S', 't', 'T', 'U', 'v', 'V', 'w',
                'W', 'x', 'y'
        ]:
            cmd1 += ['-f', '%s' % algo_option]
        if cat_model is True:
            cmd1 += ['-F']
        if groupingFile is not None:
            cmd1 += ['-g', '%s' % os.path.join('.', groupingFile)]
        if placementThreshold is not None:
            cmd1 += ['-G', '%f' % placementThreshold]
        if disable_pattern_compression is True:
            cmd1 += ['-H']
        if InitialRearrangement is not None:
            cmd1 += ['-i', '%d' % InitialRearrangement]
        if posteriori is not None:
            cmd1 += ['-I', '%s' % posteriori]
        if print_intermediate_trees is True:
            cmd1 += ['-j']
        if (majorityrule is not None) and (multipleTreeFile is not None):
            cmd1 += ['-J', '%s' % majorityrule]
            cmd1 += ['-z', '%s' % os.path.join('.', multipleTreeFile)]
        elif majorityrule is not None and multipleTreeFile is None:
            msg = 'Need to provide a tree file containing several UNROOTED trees via the ­z option'
            raise sysutils.PipelineStepError(msg)
        if print_branch_length is True:
            cmd1 += ['-k']
        if ICTCmetrics is not None:
            cmd1 += ['-L', '%s' % ICTCmetrics]
        if partition_branch_length is True:
            cmd1 += ['-M']
        if disable_check is True:
            cmd1 += ['-O']
        if AAmodel is not None:
            cmd1 += ['-P', '%s' % os.path.join('.', AAmodel)]
        if multiplemodelFile is not None:
            cmd1 += ['-q', '%s' % os.path.join('.', multiplemodelFile)]
        if binarytree is not None:
            cmd1 += ['-r', '%s' % os.path.join('.', binarytree)]
        if BinaryParameterFile is not None:
            cmd1 += ['-R', '%s' % os.path.join('.', BinaryParameterFile)]
        if SecondaryStructure is not None:
            cmd1 += ['-S', '%s' % os.path.join('.', SecondaryStructure)]
        if UserStartingTree is not None:
            cmd1 += ['-t', '%s' % os.path.join('.', UserStartingTree)]
        if median_GAMMA is True:
            cmd1 += ['-u']
        if rate_heterogeneity is True:
            cmd1 += ['-V']
        if window is not None:
            cmd1 += ['-W', '%d' % window]
        if RapidBootstrapNumSeed is not None:
            cmd1 += ['-x', '%d' % RapidBootstrapNumSeed]
        if random_addition is True:
            cmd1 += ['-X']
        if starting_tree is True:
            cmd1 += ['-y']
        if quartetGroupingFileName is not None:
            cmd1 += ['-Y', '%s' % os.path.join('.', quartetGroupingFileName)]
        if multipleTreeFile is not None:
            cmd1 += ['-z', '%s' % os.path.join('.', multipleTreeFile)]
        if NumberofRuns is not None:
            cmd1 += ['-N', '%d' % NumberofRuns]
        if mesquite is True:
            cmd1 += ['--mesquite']
        if silent is True:
            cmd1 += ['--silent']
        if noseqcheck is True:
            cmd1 += ['--no-seq-check']
        if nobfgs is True:
            cmd1 += ['--no-bfgs']
        if epaPlaceNum is not None:
            cmd1 += ['­­epa­keep­placements=%d' % epaPlaceNum]
        if epaProbThreshold is not None:
            cmd1 += ['­­epa­prob­threshold=%f' % epaProbThreshold]
        if epaLikelihood is not None:
            cmd1 += ['­­epa­accumulated­threshold=%f' % epaLikelihood]
        if HKY85 is True:
            cmd1 += ['--HKY85']
        if BootstrapPerm is not None:
            cmd1 += ['[­­bootstop­perms=%s' % BootstrapPerm]
        if option_help is True:
            cmd1 += ['-h']

        cmd1 += ['-s', '%s' % os.path.abspath(seqs), '-n', '%s' % output_name]

        sysutils.command_runner([
            cmd1,
        ], 'build_tree', quiet, logfile, debug)

    # copy files from tmpdir to output directory
    if os.path.exists(os.path.join(tempdir,
                                   'RAxML_bestTree.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_bestTree.%s' % output_name),
                    os.path.abspath(output_dir))
    if os.path.exists(os.path.join(tempdir, 'RAxML_info.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_info.%s' % output_name),
                    os.path.abspath(output_dir))
    if os.path.exists(
            os.path.join(tempdir, 'RAxML_perSiteLLs.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_perSiteLLs.%s' % output_name),
                    os.path.abspath(output_dir))
    if os.path.exists(
            os.path.join(tempdir,
                         'RAxML_bipartitionFrequencies.%s' % output_name)):
        shutil.copy(
            os.path.join(tempdir,
                         'RAxML_bipartitionFrequencies.%s' % output_name),
            os.path.abspath(output_dir))
    if os.path.exists(
            os.path.join(tempdir,
                         'RAxML_bipartitionsBranchLabels.%s' % output_name)):
        shutil.copy(
            os.path.join(tempdir,
                         'RAxML_bipartitionsBranchLabels.%s' % output_name),
            os.path.abspath(output_dir))
    if os.path.exists(
            os.path.join(tempdir, 'RAxML_bipartitions.%s' % output_name)):
        shutil.copy(
            os.path.join(tempdir, 'RAxML_bipartitions.%s' % output_name),
            os.path.abspath(output_dir))
    if os.path.exists(os.path.join(tempdir,
                                   'RAxML_bootstrap.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_bootstrap.%s' % output_name),
                    os.path.abspath(output_dir))
    if os.path.exists(
            os.path.join(tempdir, 'RAxML_checkpoint.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_checkpoint.%s' % output_name),
                    os.path.abspath(output_dir))
    if os.path.exists(
            os.path.join(tempdir, 'RAxML_randomTree.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_randomTree.%s' % output_name),
                    os.path.abspath(output_dir))
    if os.path.exists(
            os.path.join(tempdir, 'RAxML_parsimonyTree.%s' % output_name)):
        shutil.copy(
            os.path.join(tempdir, 'RAxML_parsimonyTree.%s' % output_name),
            os.path.abspath(output_dir))
    if os.path.exists(os.path.join(tempdir, 'RAxML_result.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_result.%s' % output_name),
                    os.path.abspath(output_dir))
    if os.path.exists(os.path.join(tempdir, 'RAxML_log.%s' % output_name)):
        shutil.copy(os.path.join(tempdir, 'RAxML_log.%s' % output_name),
                    os.path.abspath(output_dir))

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'build_tree', quiet, logfile)

    cmd3 = [
        'echo',
        'Stage completed. Output files are located here: %s\n' %
        os.path.abspath(output_dir)
    ]
    sysutils.command_runner([
        cmd3,
    ], 'build_tree', quiet, logfile, debug)
Beispiel #8
0
def summary_stats(dir_list=None, ph_list=None, quiet=False, logfile=None, debug=False, amplicons=False, outdir='.'):
    """Collect per-sample (and optionally per-haplotype) summary statistics.

    Writes ``summary_stats.txt`` and ``summary_stats.tsv`` into ``outdir``,
    plus ``PH_summary_stats.tsv`` when ``ph_list`` is given. For each sample
    directory the BAM file is indexed and ``samtools idxstats`` is run; a
    gzipped VCF, if present, is unzipped in place.

    Args:
        dir_list (str): Path to a text file listing one sample directory per
            line. Blank lines are ignored. Required.
        ph_list (str): Path to a text file listing one haplotype directory
            per line (each expected to hold ``ph_summary.txt``). Optional.
        quiet: Passed through to ``sysutils.command_runner``.
        logfile: Passed through to ``sysutils.command_runner``.
        debug: Passed through to ``sysutils.command_runner``.
        amplicons (bool): If True, also report per-amplicon length, read
            count, coverage, SNP count and theta for every record found in
            each sample's ``final.fna``.
        outdir (str): Directory the summary files are written to.

    Raises:
        MissingRequiredArgument: If ``dir_list`` is None.
    """
    # check for samtools
    sysutils.check_dependency('samtools')

    # check for dir_list (required)
    if dir_list is None:
        msg = 'no directory list given'
        raise MissingRequiredArgument(msg)

    # Read the sample directories; drop blank lines up front so that the
    # sample index always refers to a real entry (a blank line in the middle
    # of the list used to shift the indexing and drop trailing samples).
    with open(dir_list, 'r') as dirfile:
        filenames = [ln for ln in dirfile.read().splitlines() if len(ln) > 0]

    # Read the PH directories the same way (optional input).
    phnames = []
    if ph_list is not None:
        with open(ph_list, 'r') as phlistfile:
            phnames = [ln for ln in phlistfile.read().splitlines() if len(ln) > 0]

    tsv_header = []
    tsv_samps = []
    # Defined up front so the TSV header / final message still work when the
    # sample list is empty or the PH list has no entries.
    all_amplicons = []
    ph_tsv_header = []
    ph_tsv_samps = []

    with open(os.path.join(outdir, 'summary_stats.txt'), 'w') as outfile:
        for i, sampdir in enumerate(filenames):  # for each sample

            # set file names
            bowtiefile = os.path.join(sampdir, 'final_bt2.out')
            trimfile = os.path.join(sampdir, 'trimmomatic_summary.out')
            bamfile = os.path.join(sampdir, 'final.bam')
            outidxstat = os.path.join(sampdir, 'final.idxstat.txt')
            finalfina = os.path.join(sampdir, 'final.fna')
            vcfzipped = os.path.join(sampdir, 'final.vcf.gz')
            vcfunzipped = os.path.join(sampdir, 'final.vcf')

            sampname = str(sampdir)
            num_cols = sampname.count('/') + 1

            if i == 0:  # if the first iteration, create tsv_header
                # one "dir_N" column per path component of the first sample
                for x in range(num_cols):
                    tsv_header += ['dir_%s' % str(x)]
                tsv_header += ['RAW', 'CLEAN', 'ALN_RATE']

            # output block 1
            # NOTE(review): search_file presumably returns the matching line
            # of the given file -- confirm against its definition.
            outfile.write("SAMPLE " + "%s:\n" % sampname)
            outfile.write("\t Directory: %s\n" % str(os.path.abspath(sampdir)))
            raw = search_file(trimfile, "Input Read Pairs").split(' ')[3]
            outfile.write("\t Number of raw read pairs: %s\n" % raw)
            cleaned = search_file(bowtiefile, "reads;").split(' ')[0]
            outfile.write("\t Number of cleaned read pairs: %s\n" % cleaned)
            aln_rate = search_file(bowtiefile, "overall alignment rate").split(' ')[0]
            outfile.write("\t Overall alignment rate: %s\n" % aln_rate)

            # create tsv line
            tsv_samp_temp = []
            tsv_samp_temp += sampname.split('/')
            tsv_samp_temp += [str(raw), str(cleaned), str(aln_rate)]

            # index bam file with samtools
            cmd0 = ["samtools index %s" % bamfile]
            sysutils.command_runner([cmd0, ], 'summary_stats', quiet, logfile, debug)

            # run idxstats with samtools
            cmd1 = ["samtools idxstats %s > %s" % (bamfile, outidxstat)]
            sysutils.command_runner([cmd1, ], 'summary_stats', quiet, logfile, debug)

            # unzip vcf file
            if os.path.isfile(vcfzipped):
                cmd2 = ["gunzip %s" % vcfzipped]
                sysutils.command_runner([cmd2, ], 'summary_stats', quiet, logfile, debug)

            # if amplicon assembly
            if amplicons is True:
                # Reset per sample: the TSV header below is built from the
                # amplicons of the last sample, i.e. all samples are assumed
                # to share the same amplicon set.
                all_amplicons = []
                for record in SeqIO.parse(finalfina, 'fasta'):
                    # region name is the 6th '|'-separated field of the id
                    reg_short = record.name.split('|')[5]
                    all_amplicons.append(str(reg_short))

                    # parse outidxstat and output
                    outfile.write("\t\t Amplicon %s:\n" % reg_short)
                    idx_fields = search_file(outidxstat, reg_short).split('\t')
                    leng = idx_fields[1]
                    outfile.write("\t\t\t Amplicon length: %s\n" % leng)
                    count = idx_fields[2]
                    outfile.write("\t\t\t Amplicon read count: %s\n" % count)

                    # run depth with samtools for coverage
                    dep = os.path.join(sampdir, 'final.depth.%s.txt' % reg_short)
                    cmd3 = ["samtools depth -r '%s' %s > %s" % (str(record.name), bamfile, dep)]
                    sysutils.command_runner([cmd3, ], 'summary_stats', quiet, logfile, debug)

                    # parse dep file from samtools: one line per covered position
                    with open(dep) as depfile:
                        lines = sum(1 for _ in depfile)
                    perc = (lines / int(leng)) * 100

                    # output coverage
                    outfile.write("\t\t\t Amplicon coverage: %s (%s percent)\n" % (lines, perc))
                    snps = parse_vcf_file(vcfunzipped, reg_short)
                    outfile.write("\t\t\t Number of SNPS: %s\n" % snps)
                    theta = float(snps) / float(leng)
                    outfile.write("\t\t\t Theta: %1.5f\n\n" % theta)

                    # add to tsv line
                    tsv_samp_temp += [str(leng), str(count), str(lines), str(perc), str(snps), str(theta)]

            # add line to list for tsv
            tsv_samps += [tsv_samp_temp]

        # HAPLOTYPE FILES
        if phnames:
            outfile.write("\n\nHAPLPOTYPE SUMMARY STATISTICS\n\n")

            for i, phdir in enumerate(phnames):  # for each PH directory
                phfile = os.path.join(phdir, 'ph_summary.txt')
                num_cols = phdir.count('/') + 1

                if i == 0:  # if the first iteration, create ph_tsv_header
                    for x in range(num_cols):
                        ph_tsv_header += ['dir_%s' % str(x)]
                    ph_tsv_header += ['PH_NUM_HAP', 'PH_HAP_DIVERSITY', 'PH_SEQ_LEN']

                # output from parsing phfile
                outfile.write("PH OUTPUT FILE %s:\n" % phfile)
                num_hap = search_file(phfile, "PH_num_hap").split(' ')[1]
                outfile.write("\t Number of haplotypes: %s\n" % num_hap)
                div = search_file(phfile, "PH_hap_diversity").split(' ')[1]
                outfile.write("\t Haplotype diversity: %s\n" % div)
                seq_len = search_file(phfile, "PH_seq_len").split(' ')[1]
                outfile.write("\t Sequence length: %s\n" % seq_len)

                # create tsv line
                ph_tsv_samp_temp = []
                ph_tsv_samp_temp += phdir.split('/')
                ph_tsv_samp_temp += [str(num_hap), str(div), str(seq_len)]

                # add line to ph_tsv_samps
                ph_tsv_samps += [ph_tsv_samp_temp]

    # make summary_stats.tsv file
    with open(os.path.join(outdir, 'summary_stats.tsv'), 'w') as outfile:
        if amplicons is True:
            for amp in all_amplicons:
                tsv_header += ['%s_LEN' % amp, '%s_RC' % amp, '%s_COV_NUM' % amp,
                               '%s_COV_PERC' % amp, '%s_SNPS' % amp, '%s_THETA' % amp]
        outfile.write(('\t').join(tsv_header) + '\n')
        for samp in tsv_samps:
            outfile.write(('\t').join(samp) + '\n')

    # make PH_summary_stats.tsv file
    if ph_list is not None:
        with open(os.path.join(outdir, 'PH_summary_stats.tsv'), 'w') as outfile:
            outfile.write(('\t').join(ph_tsv_header) + '\n')
            for samp in ph_tsv_samps:
                outfile.write(('\t').join(samp) + '\n')

    # ending summary message -- report the file where it was actually
    # written (inside outdir), not relative to the current directory
    cmd3 = ['echo', 'Stage completed. Summary stats are located here: %s\n'
            % os.path.abspath(os.path.join(outdir, 'summary_stats.txt'))]
    if amplicons is True:
        cmd3 += ['echo', 'Amplicons: %s\n' % (', ').join(all_amplicons)]
    sysutils.command_runner([cmd3, ], 'summary_stats', quiet, logfile, debug)
Beispiel #9
0
def run_mafft(inputseqs=None,
              out_align="alignment.fasta",
              auto=None,
              algo=None,
              sixmerpair=None,
              globalpair=None,
              localpair=None,
              genafpair=None,
              fastapair=None,
              weighti=None,
              retree=None,
              maxiterate=None,
              noscore=None,
              memsave=None,
              parttree=None,
              dpparttree=None,
              fastaparttree=None,
              partsize=None,
              groupsize=None,
              lop=None,
              lep=None,
              lexp=None,
              LOP=None,
              LEXP=None,
              bl=None,
              jtt=None,
              tm=None,
              aamatrix=None,
              fmodel=None,
              clustalout=None,
              inputorder=None,
              reorder=None,
              treeout=None,
              quiet_mafft=None,
              nuc=None,
              amino=None,
              quiet=False,
              logfile=None,
              debug=False,
              ncpu=1,
              msadir='.',
              phylipout=None):
    """Build and run a MAFFT command line, redirecting output to a file.

    The alignment is written to ``msadir/<basename of out_align>``. Boolean
    switches are added only when the argument ``is True``; valued options
    are added whenever the argument is not None.

    Args:
        inputseqs (str): Path to the input sequence file given to MAFFT.
        out_align (str): Output file name; only its basename is used.
        algo (str): Optional MAFFT wrapper to invoke instead of ``mafft``
            (one of linsi, ginsi, einsi, fftnsi, fftns, nwns, nwnsi). When
            given, ``--thread`` is not added to the command.
        ncpu (int): Thread count passed as ``--thread`` when ``algo`` is None.
        msadir (str): Directory the alignment is written to.
        phylipout (bool): If True, additionally convert the output from
            FASTA to relaxed PHYLIP (``<base>.phy``).
        clustalout (bool): If True, pass ``--clustalout`` and rename the
            output to ``<base>.aln`` afterwards.
        quiet, logfile, debug: Passed through to ``sysutils.command_runner``.
        All remaining arguments map one-to-one onto the MAFFT option of the
        same name (``sixmerpair`` -> ``--6merpair``, ``quiet_mafft`` ->
        ``--quiet``).

    Raises:
        sysutils.PipelineStepError: If ``algo`` is not a recognised wrapper.
    """
    sysutils.check_dependency('mafft')

    ## create MAFFT command using input options
    if algo is None:
        cmd1 = [
            'mafft',
            '--thread',
            '%d' % ncpu,
        ]
    else:
        if algo not in [
                'linsi', 'ginsi', 'einsi', 'fftnsi', 'fftns', 'nwns', 'nwnsi'
        ]:
            msg = 'Algorithm not in MAFFT'
            raise sysutils.PipelineStepError(msg)
        cmd1 = ['%s' % algo]

    # (value, flag, format) in command-line order. A format of None marks a
    # boolean switch; otherwise the value is rendered with that format and
    # appended after the flag.
    options = (
        # output / input type switches
        (clustalout, '--clustalout', None),
        (inputorder, '--inputorder', None),
        (reorder, '--reorder', None),
        (treeout, '--treeout', None),
        (quiet_mafft, '--quiet', None),
        (nuc, '--nuc', None),
        (amino, '--amino', None),
        # algorithm options
        (auto, '--auto', None),
        (sixmerpair, '--6merpair', None),
        (globalpair, '--globalpair', None),
        (localpair, '--localpair', None),
        (genafpair, '--genafpair', None),
        (fastapair, '--fastapair', None),
        (weighti, '--weighti', '%f'),
        (retree, '--retree', '%d'),
        (maxiterate, '--maxiterate', '%d'),
        (noscore, '--noscore', None),
        (memsave, '--memsave', None),
        (parttree, '--parttree', None),
        (dpparttree, '--dpparttree', None),
        (fastaparttree, '--fastaparttree', None),
        (partsize, '--partsize', '%d'),
        (groupsize, '--groupsize', '%d'),
        # parameters
        (lop, '--lop', '%f'),
        (lep, '--lep', '%f'),
        (lexp, '--lexp', '%f'),
        (LOP, '--LOP', '%f'),
        (LEXP, '--LEXP', '%f'),
        (bl, '--bl', '%d'),
        (jtt, '--jtt', '%d'),
        (tm, '--tm', '%d'),
        (aamatrix, '--aamatrix', '%s'),
        (fmodel, '--fmodel', None),
    )
    for value, flag, fmt in options:
        if fmt is None:
            if value is True:
                cmd1 += [flag]
        elif value is not None:
            cmd1 += [flag, fmt % value]

    # Outputs
    outName = os.path.join(msadir, '%s' % os.path.basename(out_align))
    # Strip whatever extension the output has (not just a 6-char ".fasta")
    # when deriving the names of the converted / renamed files.
    outBase = os.path.splitext(outName)[0]

    ## create command
    cmd1 += ['%s' % inputseqs, '>', '%s' % outName]

    ## run MAFFT command
    sysutils.command_runner([
        cmd1,
    ], 'multiple_align', quiet, logfile, debug)

    if phylipout is True:
        # NOTE(review): this reads outName as FASTA, so combining phylipout
        # with clustalout likely yields no usable PHYLIP file -- confirm.
        phyout = outBase + '.phy'
        SeqIO.convert(
            outName, 'fasta', phyout,
            'phylip-relaxed')  # relaxed allows for long sequence names
        cmd2 = ['echo', 'Output converted to PHYLIP format from FASTA format.']
        sysutils.command_runner([
            cmd2,
        ], 'multiple_align', quiet, logfile, debug)

    if clustalout is True:
        clustout = outBase + '.aln'
        cmd3 = ['mv', outName, clustout]
        sysutils.command_runner([
            cmd3,
        ], 'multiple_align', quiet, logfile, debug)
        cmd4 = ['echo', 'Alignment output is in CLUSTAL format.']
        sysutils.command_runner([
            cmd4,
        ], 'multiple_align', quiet, logfile, debug)

    return
Beispiel #10
0
def pairwise_align(
    amplicons_fa=None,
    ref_fa=None,
    ref_gtf=None,
    outdir='.',
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Align every amplicon to its reference and dump the result as JSON

    Each amplicon is matched to an 'amplicon' GTF feature, aligned with
    ``baln.alignAA`` using the feature's coding regions, and then all
    amplicons of one (sample, reference) pair are stitched into a single
    reference-length alignment, with '*' filling positions no amplicon covers.

    Args:
        amplicons_fa (str): Fasta file holding the amplicon sequences
        ref_fa (str): Fasta file holding the reference sequence(s)
        ref_gtf (str): GTF file describing amplicons on the reference
        outdir (str): Directory that receives ``alignments.json``
        keep_tmp (bool): Leave the temporary directory in place
        quiet (bool): Suppress console output
        logfile (file): Destination for appended console output
        debug (bool): Print commands but do not run

    Returns:
        out_aln (str): Path of the JSON file that was written

    """
    # blastx must be available before anything else is attempted
    sysutils.check_dependency('blastx')

    # Output location and scratch space
    out_aln = os.path.join(outdir, 'alignments.json')
    tempdir = sysutils.create_tempdir('pairwise_align', None, quiet, logfile)

    # Reference records indexed by sequence id
    refseqs = {}
    for refrec in SeqIO.parse(ref_fa, 'fasta'):
        refseqs[refrec.id] = refrec

    # Amplicon features indexed by (chromosome, amplicon name)
    amps = list(
        filter(lambda row: row.feature == 'amplicon',
               gtfparse.gtf_parser(ref_gtf)))
    ampdict = {}
    for row in amps:
        ampdict[(row.chrom, row.attrs['name'])] = row

    out_json = {
        'aa_alignments': {},
        'nuc_alignments': {},
        'padded_alignments': {},
        'padded_gtf': [],
    }
    # (sid, ref) -> list of (region name, nucleotide alignment)
    all_nuc_aln = defaultdict(list)

    for amprec in SeqIO.parse(amplicons_fa, 'fasta'):
        # The sequence id encodes sample, reference and region
        aid = sequtils.parse_seq_id(amprec.id)
        # Prefer the exact (ref, reg) feature; otherwise fall back to the
        # first feature that merely shares the region name
        try:
            gtf_line = ampdict[(aid['ref'], aid['reg'])]
        except KeyError:
            candidates = [k for k in ampdict.keys() if k[1] == aid['reg']]
            gtf_line = ampdict[candidates[0]]

        # Primary coding region: "start-end" becomes a 0-based start and
        # an unchanged end
        pri_parts = gtf_line.attrs['primary_cds'].split('-')
        pri_s = int(pri_parts[0]) - 1
        pri_e = int(pri_parts[1])
        # Optional comma-separated list of further coding regions
        altcds = []
        if 'alt_cds' in gtf_line.attrs:
            altcds = [
                ((int(rng.split('-')[0]) - 1), int(rng.split('-')[1]))
                for rng in gtf_line.attrs['alt_cds'].split(',')
            ]

        # Amino-acid guided alignment of this amplicon
        refseq = matching_refseq(refseqs, aid['ref'])
        alnobj, nuc_aln = baln.alignAA(refseq, amprec, (pri_s, pri_e), altcds,
                                       tempdir, quiet)

        all_nuc_aln[(aid['sid'], aid['ref'])].append((aid['reg'], nuc_aln))
        jid = 'sid|%s|ref|%s|reg|%s|' % (aid['sid'], aid['ref'], aid['reg'])
        out_json['aa_alignments'][jid] = alnobj.aa_align
        out_json['nuc_alignments'][jid] = nuc_aln

    # Stitch the segments of each (sample, reference) into one padded alignment
    for (sid, ref), seglist in list(all_nuc_aln.items()):
        _refseq = matching_refseq(refseqs, ref)
        newname = 'sid|%s|ref|%s|' % (sid, _refseq.id)
        padded = []
        # Process segments in order of their first reference position
        ordered = sorted(seglist, key=lambda pair: pair[1][0][0])
        rpos = qpos = 0
        for sname, seg in ordered:
            row = GTFRow()
            row.chrom, row.source, row.feature = (newname, 'haphpipe',
                                                  'amplicon')
            row.score, row.strand, row.frame = ('.', '+', '.')
            row.attrs['name'] = sname

            # Fill the gap between the previous segment and this one
            # (range is empty when there is no gap)
            for p in range(rpos, seg[0][0]):
                padded.append((p, str(_refseq.seq[p]), '*', qpos))
                qpos += 1
            row.start = qpos + 1
            # Entries whose 4th field is -1 are copied verbatim; all others
            # are renumbered against the running query position
            for t in seg:
                if t[3] == -1:
                    padded.append(t)
                else:
                    padded.append((t[0], t[1], t[2], qpos))
                    qpos += 1
            row.end = qpos
            # Segment statistics and the called region go into the attributes
            row.attrs.update(baln.get_seg_stats(seg))
            row.attrs['call_reg'] = '%d-%d' % (row.start, row.end)
            row.attrs['call_len'] = (row.end - row.start + 1)
            out_json['padded_gtf'].append(str(row))
            rpos = seg[-1][0] + 1

        # Fill from the last segment to the end of the reference
        for p in range(rpos, len(_refseq.seq)):
            padded.append((p, str(_refseq.seq[p]), '*', qpos))
            qpos += 1

        # Only alignments that pass validation are stored
        vseq = ''.join(t[2] for t in padded if t[3] != -1)
        if baln.validate_alignment(padded, _refseq.seq, vseq):
            if not quiet:
                print('%s alignment validation passed' % newname,
                      file=sys.stderr)
            out_json['padded_alignments'][newname] = padded

    # Echo the padded GTF lines unless silenced
    if not quiet:
        for gtf_text in out_json['padded_gtf']:
            print(gtf_text, file=sys.stdout)

    with open(out_aln, 'w') as outh:
        outh.write(json.dumps(out_json) + '\n')

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'pairwise_align', quiet, logfile)

    return out_aln
Beispiel #11
0
def join_reads(
    fq1=None,
    fq2=None,
    outdir=".",
    min_overlap=None,
    max_overlap=None,
    allow_outies=None,
    encoding=None,
    ncpu=1,
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Run ``flash`` on a read pair and collect its three output files

    ``flash`` writes into a temporary directory; its extended-fragment and
    not-combined fastq files are then moved into ``outdir``.

    Args:
        fq1 (str): Fastq file with read 1 (required)
        fq2 (str): Fastq file with read 2 (required)
        outdir (str): Directory that receives the output fastq files
        min_overlap (int): Value for flash ``-m``; omitted when None
        max_overlap (int): Value for flash ``-M``; omitted when None
        allow_outies (bool): Add flash ``-O`` when True
        encoding (str): Quality encoding; guessed from ``fq1`` when None.
            Anything other than "Phred+33" adds ``-p 64``
        ncpu (int): Thread count passed as flash ``-t``
        keep_tmp (bool): Leave the temporary directory in place
        quiet (bool): Suppress console output
        logfile (file): Destination for appended console output
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to fastq file with unjoined read 1
        out2 (str): Path to fastq file with unjoined read 2
        outU (str): Path to fastq file with joined reads

    Raises:
        sysutils.PipelineStepError: If either ``fq1`` or ``fq2`` is missing
    """
    # Both mates are mandatory
    if fq1 is None or fq2 is None:
        msg = "Incorrect combination of reads: fq1=%s fq2=%s" % (fq1, fq2)
        raise sysutils.PipelineStepError(msg)

    sysutils.check_dependency('flash')

    # Fall back to auto-detecting the quality encoding
    if encoding is None:
        encoding = helpers.guess_encoding(fq1)

    # Final destinations
    outU = os.path.join(outdir, 'joined.fastq')
    out1 = os.path.join(outdir, 'notjoined_1.fastq')
    out2 = os.path.join(outdir, 'notjoined_2.fastq')

    # Scratch directory flash writes into
    tempdir = sysutils.create_tempdir('join_reads', None, quiet, logfile)

    # Assemble the flash invocation
    flash_cmd = ['flash', '-t', '%d' % ncpu, '-d', tempdir]
    if encoding != "Phred+33":
        flash_cmd.extend(['-p', '64'])
    if min_overlap is not None:
        flash_cmd.extend(['-m', '%d' % min_overlap])
    if max_overlap is not None:
        flash_cmd.extend(['-M', '%d' % max_overlap])
    if allow_outies is True:
        flash_cmd.append('-O')
    flash_cmd.extend([fq1, fq2])

    # One 'mv' per flash output: (name inside tempdir, destination)
    relocations = (
        ('out.extendedFrags.fastq', outU),
        ('out.notCombined_1.fastq', out1),
        ('out.notCombined_2.fastq', out2),
    )
    move_cmds = [['mv', os.path.join(tempdir, produced), dest]
                 for produced, dest in relocations]

    sysutils.command_runner([flash_cmd] + move_cmds, 'join_reads', quiet,
                            logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'join_reads', quiet, logfile)

    return out1, out2, outU
Beispiel #12
0
def sample_reads(
    fq1=None,
    fq2=None,
    fqU=None,
    outdir='.',
    nreads=None,
    frac=None,
    seed=None,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to subsample reads with ``seqtk sample``

    Every input file is sampled with the same seed and the same
    count/fraction argument; ``frac`` takes precedence over ``nreads``
    when both are given.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        nreads (int): Number of reads to sample
        frac (float): Fraction of reads to sample
        seed (int): Seed for random number generator; when None a value
            from ``random.randrange(1, 1000)`` is used and logged
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out1 (str): Path to sampled fastq file with read 1 (None if no pair)
        out2 (str): Path to sampled fastq file with read 2 (None if no pair)
        outU (str): Path to sampled fastq file with unpaired reads (None if
            ``fqU`` was not given)

    Raises:
        MissingRequiredArgument: If the combination of input files is not
            one of (fq1, fq2), (fqU) or (fq1, fq2, fqU), or if neither
            ``nreads`` nor ``frac`` is given.
        sysutils.PipelineStepError: If ``frac`` is outside (0, 1].
    """
    # Check inputs: a complete pair (with or without fqU), or fqU alone
    has_pair = fq1 is not None and fq2 is not None
    has_unpaired = fqU is not None
    unpaired_only = fq1 is None and fq2 is None and has_unpaired
    if not (has_pair or unpaired_only):
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('seqtk')

    # Set seed
    seed = seed if seed is not None else random.randrange(1, 1000)
    sysutils.log_message('[--- sample_reads ---] Random seed = %d\n' % seed,
                         quiet, logfile)

    # Set nreads/frac
    if frac is not None:
        if frac <= 0 or frac > 1:
            raise sysutils.PipelineStepError('--frac must be > 0 and <= 1.')
        frac_arg = '%f' % frac
    elif nreads is not None:
        frac_arg = '%d' % nreads
    else:
        # Previously this fell through to '%d' % None and died with an
        # uninformative TypeError.
        raise MissingRequiredArgument('requires either --nreads or --frac')

    # Outputs exist only for the inputs that were provided
    out1 = os.path.join(outdir, 'sample_1.fastq') if has_pair else None
    out2 = os.path.join(outdir, 'sample_2.fastq') if has_pair else None
    outU = os.path.join(outdir, 'sample_U.fastq') if has_unpaired else None

    # One seqtk call per provided input, in the order read 1, read 2,
    # unpaired; the shared seed is reused for every file.
    cmds = [
        ['seqtk', 'sample', '-s%d' % seed, src, frac_arg, '>', dest]
        for src, dest in ((fq1, out1), (fq2, out2), (fqU, outU))
        if dest is not None
    ]

    sysutils.command_runner(cmds, 'sample_reads', quiet, logfile, debug)
    return out1, out2, outU
Beispiel #13
0
def cliquesnv(fq1=None,
              fq2=None,
              fqU=None,
              ref_fa=None,
              outdir='.',
              jardir='.',
              O22min=None,
              O22minfreq=None,
              printlog=None,
              single=False,
              merging=None,
              fasta_format='extended4',
              outputstart=None,
              outputend=None,
              keep_tmp=False,
              quiet=False,
              logfile=None,
              debug=False,
              ncpu=1):
    """ Pipeline step to reconstruct haplotypes with CliqueSNV

    Every sequence in the reference FASTA is treated as one reconstruction
    region. Reads are aligned to each reference with ``bwa mem``, CliqueSNV
    (method ``snv-illumina``) is run on each alignment, and the results are
    copied to ``<outdir>/clique_snv/<region>/`` as ``<region>.txt``,
    ``<region>.fasta`` and ``<region>_summary.txt``.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        jardir (str): Directory that contains ``clique-snv.jar``
        O22min (float): Passed to CliqueSNV as ``-t`` when not None
        O22minfreq (float): Passed to CliqueSNV as ``-tf`` when not None
        printlog: When not None, ``-log`` is passed to CliqueSNV
        single (bool): Align ``fqU`` as single-end reads. Forced to True
            when only ``fqU`` is given.
        merging (str): Passed to CliqueSNV as ``-cm`` when not None
        fasta_format (str): Passed to CliqueSNV as ``-fdf``
        outputstart (int): Passed to CliqueSNV as ``-os`` when not None
        outputend (int): Passed to CliqueSNV as ``-oe`` when not None
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        ncpu (int): Number of threads passed to CliqueSNV

    Returns:
        None

    Raises:
        MissingRequiredArgument: If the fastq inputs are incomplete, the
            reference FASTA is missing, or the JAR file is not found.

    """
    # Only an unpaired file was given: switch to single-end mode.
    if fq1 is None and fq2 is None and fqU is not None:
        single = True

    # Check required arguments
    if fq1 is None and fq2 is None and fqU is None:
        raise MissingRequiredArgument("No fastq files given.")
    if not single and (fq1 is None or fq2 is None):
        raise MissingRequiredArgument("Either fq1 or fq2 missing.")
    if single and fqU is None:
        # Fail early instead of handing ``None`` to the bwa command line.
        raise MissingRequiredArgument("No unpaired fastq file (fqU) given.")
    if ref_fa is None:
        raise MissingRequiredArgument("Reference FASTA missing.")

    # Check dependencies
    sysutils.check_dependency('samtools')
    sysutils.check_dependency('bwa')

    if os.path.isfile(os.path.join(jardir, "clique-snv.jar")):
        print("CliqueSNV JAR file found.")
    else:
        raise MissingRequiredArgument("No JAR file found.")

    # Temporary directory
    tempdir = sysutils.create_tempdir('clique_snv', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions: one full-length region per reference
    regions = []
    for rname, s in refs.items():
        regions.append(('cs%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n', quiet,
                         logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    if not single:  # paired end
        # Remove .1 and .2 from read names. Raw strings keep the backslash
        # for sed without relying on Python's invalid-escape fallback.
        fq1_c = os.path.join(tempdir, "fq1_corrected.fastq")
        fq2_c = os.path.join(tempdir, "fq2_corrected.fastq")
        cmd01 = [r"cat %s | sed 's/\.1 / /' > %s" % (fq1, fq1_c)]
        cmd02 = [r"cat %s | sed 's/\.2 / /' > %s" % (fq2, fq2_c)]
        sysutils.command_runner([cmd01, cmd02], 'clique_snv:setup', quiet,
                                logfile, debug)
        read_files = [fq1_c, fq2_c]
    else:  # single read
        read_files = [fqU]

    # Create alignment for each REFERENCE in the reconstruction regions.
    # The numeric suffix of the temporary files is the order in which the
    # reference was first seen; the per-region loop below relies on it.
    alnmap = {}
    for cs, rname, spos, epos in regions:
        if rname in alnmap:
            continue
        tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
        tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
        SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
        cmd1 = ['bwa', 'index', tmp_ref_fa]
        cmd2 = ['bwa', 'mem', tmp_ref_fa] + read_files + [
            '|',
            'samtools',
            'view',
            '-h',
            '-F',
            '12',
            '>',
            tmp_sam,
        ]
        # Remove the bwa index files once the alignment has been written.
        cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
        sysutils.command_runner([cmd1, cmd2, cmd3], 'clique_snv:setup',
                                quiet, logfile, debug)
        alnmap[rname] = (tmp_ref_fa, tmp_sam)

    # Run CliqueSNV for each region
    cmd4 = ['mkdir -p %s' % os.path.join(outdir, 'clique_snv')]
    sysutils.command_runner([cmd4],
                            stage='cliquesnv',
                            quiet=quiet,
                            logfile=logfile,
                            debug=debug)

    # ``i`` is the index used in the temporary alignment/output filenames.
    for i, (cs, rname, spos, epos) in enumerate(regions):
        msg = "Reconstruction region %s:" % cs
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # Rename the cliquesnv number (cs##) to include the region, taken
        # from the second-to-last '|'-separated field of the reference name
        # (now: cs##_reg).
        cs = '%s_%s' % (cs, rname.split('|')[-2])

        samfile = os.path.join(tempdir, 'aligned.%d.sam' % i)
        method = 'snv-illumina'
        cmd5 = [
            'java -jar %s -m %s -in %s -threads %d -outDir %s -fdf %s' %
            (os.path.join(jardir, 'clique-snv.jar'), method, samfile, ncpu,
             tempdir, fasta_format)
        ]
        if O22min is not None:
            cmd5 += ['-t %f' % O22min]
        if O22minfreq is not None:
            cmd5 += ['-tf %f' % O22minfreq]
        if printlog is not None:
            cmd5 += ['-log']
        if merging is not None:
            cmd5 += ['-cm %s' % merging]
        if outputstart is not None:
            cmd5 += ['-os %d' % outputstart]
        if outputend is not None:
            cmd5 += ['-oe %d' % outputend]
        sysutils.command_runner([cmd5],
                                stage='clique_snv',
                                quiet=quiet,
                                logfile=logfile,
                                debug=debug)

        # Copy output files out of the temporary directory
        region_dir = os.path.join(outdir, 'clique_snv/%s' % cs)
        tmp_txt = os.path.join(tempdir, 'aligned.%d.txt' % i)
        tmp_fasta = os.path.join(tempdir, 'aligned.%d.fasta' % i)
        out_txt = os.path.join(outdir, 'clique_snv/%s/%s.txt' % (cs, cs))
        out_fasta = os.path.join(outdir, 'clique_snv/%s/%s.fasta' % (cs, cs))
        out_summary = os.path.join(outdir,
                                   'clique_snv/%s/%s_summary.txt' % (cs, cs))

        os.makedirs(region_dir, exist_ok=True)
        if os.path.exists(tmp_txt):
            shutil.copy(tmp_txt, out_txt)
        if os.path.exists(tmp_fasta):
            shutil.copy(tmp_fasta, out_fasta)

        # Parse the CliqueSNV text output and write a summary
        with open(out_summary, 'w') as sumfile, open(out_txt, 'r') as infile:
            freqs = []
            haps = []
            tempnum = ''
            for line in infile:
                if "SNV got" in line:
                    tempnum = line.split(' ')[2]
                if "frequency" in line:
                    freqs += [float(line.split(' ')[2][:-2])]
                if "haplotype=" in line:
                    haps += [line.split('=')[1][1:-2]]
            sumfile.write('CliqueSNV_num_hap\t%s\n' % tempnum)

            freq_sqrd_sum = sum(x**2 for x in freqs)
            # NOTE(review): if old_div is past.utils.old_div, two integer
            # operands are floor-divided, so this factor evaluates to 1 --
            # confirm whether 7000/6999 as a float was intended.
            hap_div = ((old_div(7000, (7000 - 1))) * (1 - freq_sqrd_sum))
            sumfile.write('CliqueSNV_hap_diversity\t%s\n' % hap_div)
            sumfile.write('CliqueSNV_seq_len\t%s\n' % len(haps[0]))

        # Replace the temporary alignment name in the FASTA with the
        # reference name.
        with open(out_fasta, 'r') as fastafile:
            fastadata = fastafile.read().replace('aligned.%d' % i, rname)
        with open(out_fasta, 'w') as newfastafile:
            newfastafile.write(fastadata)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'clique_snv', quiet, logfile)

    return
Beispiel #14
0
def predict_haplo(
    fq1=None,
    fq2=None,
    ref_fa=None,
    region_txt=None,
    outdir='.',
    min_readlength=36,
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to assemble haplotypes

    Reads are aligned with ``bwa mem`` to each reference that appears in a
    reconstruction region, and PredictHaplo-Paired is run once per region
    from inside the temporary directory. The log and the ``*global*``
    ``.fas``/``.html`` outputs are copied to ``<outdir>/<PH##>/``.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        ref_fa (str): Path to reference fasta file
        region_txt (str): Path to region file, one region per line. When
            not given, every reference is reconstructed over its full
            length.
        outdir (str): Path to output directory
        min_readlength (int): Minimum readlength passed to PredictHaplo
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        best_fa (list): One ``(region_id, best_fasta)`` tuple per region,
            where ``best_fasta`` is the first value returned by
            ``rename_best`` or None if PredictHaplo failed for the region.

    Raises:
        PipelineStepError: If a region names a reference that is not in
            ``ref_fa``.

    """
    # Check dependencies (samtools is used in the alignment pipeline below)
    sysutils.check_dependency('PredictHaplo-Paired')
    sysutils.check_dependency('bwa')
    sysutils.check_dependency('samtools')

    # Temporary directory
    tempdir = sysutils.create_tempdir('predict_haplo', None, quiet, logfile)

    # Load reference fasta
    refs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Identify reconstruction regions
    regions = []
    if region_txt:
        sysutils.log_message('Found regions file.\n', quiet, logfile)
        with open(region_txt, 'r') as regfile:
            for line in regfile:
                line = line.strip()
                if not line:
                    # Ignore blank lines (e.g. a trailing newline) instead of
                    # treating them as a region.
                    continue
                rname, spos, epos = sequtils.region_to_tuple(line)
                if rname not in refs:
                    raise PipelineStepError("ERROR: reference %s not valid" %
                                            rname)
                # Missing coordinates default to the full reference length.
                spos = 1 if spos is None else spos
                epos = len(refs[rname]) if epos is None else epos
                regions.append(
                    ('PH%02d' % (len(regions) + 1), rname, spos, epos))
    else:
        for rname, s in refs.items():
            regions.append(('PH%02d' % (len(regions) + 1), rname, 1, len(s)))

    sysutils.log_message('[--- Haplotype Reconstruction Regions ---]\n', quiet,
                         logfile)
    for iv in regions:
        sysutils.log_message('%s -- %s:%d-%d\n' % iv, quiet, logfile)

    # Create alignment for each REFERENCE in the reconstruction regions.
    # Several regions may share a reference; it is aligned only once.
    alnmap = {}
    for ph, rname, spos, epos in regions:
        if rname in alnmap:
            continue
        tmp_ref_fa = os.path.join(tempdir, 'ref.%d.fa' % len(alnmap))
        tmp_sam = os.path.join(tempdir, 'aligned.%d.sam' % len(alnmap))
        SeqIO.write(refs[rname], tmp_ref_fa, 'fasta')
        cmd1 = [
            'bwa',
            'index',
            tmp_ref_fa,
        ]
        cmd2 = [
            'bwa',
            'mem',
            tmp_ref_fa,
            fq1,
            fq2,
            '|',
            'samtools',
            'view',
            '-h',
            '-F',
            '12',
            '>',
            tmp_sam,
        ]
        # Remove the bwa index files once the alignment has been written.
        cmd3 = ['rm', '-f', '%s.*' % tmp_ref_fa]
        sysutils.command_runner([cmd1, cmd2, cmd3], 'predict_haplo:setup',
                                quiet, logfile, debug)
        alnmap[rname] = (tmp_ref_fa, tmp_sam)

    best_fa = []
    # Run PredictHaplo for each REGION
    for ph, rname, spos, epos in regions:
        msg = "Reconstruction region %s:" % ph
        msg += " %s:%d-%d\n" % (rname, spos, epos)
        sysutils.log_message(msg, quiet, logfile)

        # Construct params specific for region
        reg_params = dict(DEFAULTS)
        reg_params['min_readlength'] = min_readlength
        reg_params['reconstruction_start'] = spos
        reg_params['reconstruction_stop'] = epos
        reg_params['prefix'] = '%s_out.' % ph

        # Lookup reference and alignment filename. Only basenames are used
        # because PredictHaplo is run from inside the temporary directory.
        reg_params['ref_fasta'] = os.path.basename(alnmap[rname][0])
        reg_params['alignment'] = os.path.basename(alnmap[rname][1])

        # Create config file for region
        config_file = '%s.config' % ph
        with open(os.path.join(tempdir, config_file), 'w') as outh:
            tmpconfig = config_template % reg_params
            # '###' in the template stands for a literal '%'.
            print(tmpconfig.replace('###', '%'), file=outh)
        try:
            # Run PredictHaplo
            cmd1 = [
                'cd',
                tempdir,
            ]
            cmd2 = [
                'PredictHaplo-Paired', config_file, '&>',
                '%s.log' % config_file
            ]
            sysutils.command_runner([
                cmd1,
                cmd2,
            ], 'predict_haplo:%s' % ph, quiet, logfile, debug)

            # Copy files
            dest = os.path.join(outdir, ph)
            if not os.path.exists(dest):
                os.makedirs(dest)
            shutil.copy(os.path.join(tempdir, '%s.config.log' % ph), dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.fas' % ph)):
                shutil.copy(f, dest)
            for f in glob(os.path.join(tempdir, '%s_out*global*.html' % ph)):
                shutil.copy(f, dest)
            bf, _ = rename_best(dest, ph)
            best_fa.append((ph, bf))
        except PipelineStepError as e:
            # A failed region is reported and recorded as None so that the
            # remaining regions are still processed.
            print(e, file=sys.stderr)
            if e.returncode == 139:
                print("PredictHaplo segfaulted", file=sys.stderr)
            best_fa.append((ph, None))

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'predict_haplo', quiet, logfile)

    return best_fa
Beispiel #15
0
def model_test(seqs=None,
               outname='modeltest_results',
               run_id=None,
               data_type='nt',
               partitions=None,
               seed=None,
               topology='ml',
               utree=None,
               force=None,
               asc_bias=None,
               frequencies=None,
               het=None,
               models=None,
               schemes=None,
               template=None,
               ncpu=1,
               quiet=False,
               logfile=None,
               debug=False,
               outdir='.',
               keep_tmp=False):
    """ Pipeline step to select a substitution model with ModelTest-NG

    Runs ``modeltest-ng`` in a temporary directory, copies ``<outname>.out``
    to ``outdir`` and writes ``<outname>_summary.tsv`` listing the best
    model reported for each criterion.

    Args:
        seqs (str): Path to alignment (``-i``)
        outname (str): Output name (``-o``), prefixed by ``run_id`` if given
        run_id (str): Prefix added to ``outname``
        data_type (str): ``'nt'`` or ``'aa'`` (``-d``)
        partitions (str): Passed as ``-q`` when not None
        seed (int): Passed as ``-r`` when not None
        topology (str): Starting topology (``-t``); one of ``ml``, ``mp``,
            ``fixed-ml-jc``, ``fixed-ml-gtr``, ``fixed-mp``, ``random``,
            ``user``
        utree (str): Passed as ``-u`` when not None
        force (bool): Pass ``--force`` when True
        asc_bias (str): ``lewis``, ``felsenstein`` or ``stamatakis`` (``-a``)
        frequencies (str): ``e`` or ``f`` (``-f``)
        het (str): ``u``, ``i``, ``g`` or ``f`` (``-h``)
        models (str): Path to a text file with one candidate model per line;
            passed to ``-m`` as a comma-separated list
        schemes (int): One of 3, 5, 7, 11, 203 (``-s``)
        template (str): ``raxml``, ``phyml``, ``mrbayes`` or ``paup``
            (``-T``)
        ncpu (int): Number of processes (``-p``)
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        outdir (str): Path to output directory
        keep_tmp (bool): Do not delete temporary directory

    Returns:
        None

    Raises:
        sysutils.MissingRequiredArgument: If no alignment is given.
        sysutils.PipelineStepError: If an option value is not valid or
            ModelTest-NG fails with a return code other than -6.

    """
    valid_topologies = [
        'ml', 'mp', 'fixed-ml-jc', 'fixed-ml-gtr', 'fixed-mp', 'random',
        'user'
    ]
    valid_models = {
        'nt': [
            'JC', 'HKY', 'TrN', 'TPM1', 'TPM2', 'TPM3', 'TIM1', 'TIM2',
            'TIM3', 'TVM', 'GTR'
        ],
        'aa': [
            'DAYHOFF', 'LG', 'DCMUT', 'JTT', 'MTREV', 'WAG', 'RTREV',
            'CPREV', 'VT', 'BLOSUM62', 'MTMAM', 'MTART', 'MTZOA', 'PMB',
            'HIVB', 'HIVW', 'JTTDCMUT', 'FLU', 'SMTREV'
        ],
    }

    # check dependency
    sysutils.check_dependency('modeltest-ng')

    # check required input & input options
    if seqs is None:
        msg = "No alignment given"
        raise sysutils.MissingRequiredArgument(msg)
    if data_type not in ['nt', 'aa']:
        raise sysutils.PipelineStepError("Data type not valid")
    if topology not in valid_topologies:
        raise sysutils.PipelineStepError("Topology not valid")

    # make tempdir
    tempdir = sysutils.create_tempdir('model_test', None, quiet, logfile)

    # add prefix
    if run_id is not None:
        outname = run_id + '_' + outname

    # build command
    cmd1 = [
        'modeltest-ng -i %s' % seqs,
        '-t %s' % topology,
        '-o %s' % os.path.join(tempdir, outname),
        '-p %d' % ncpu,
        '-d %s' % data_type
    ]

    if partitions is not None:
        cmd1 += ['-q %s' % partitions]

    if seed is not None:
        cmd1 += ['-r %d' % seed]

    if utree is not None:
        cmd1 += ['-u %s' % utree]

    if force is True:
        cmd1 += ['--force']

    if asc_bias is not None:
        if asc_bias not in ['lewis', 'felsenstein', 'stamatakis']:
            raise sysutils.PipelineStepError("ASC bias correction not valid")
        cmd1 += ['-a %s' % asc_bias]

    if frequencies is not None:
        if frequencies not in ['e', 'f']:
            raise sysutils.PipelineStepError("Frequencies not valid")
        cmd1 += ['-f %s' % frequencies]

    if het is not None:
        if het not in ['u', 'i', 'g', 'f']:
            raise sysutils.PipelineStepError("Rate heterogeneity not valid")
        cmd1 += ['-h %s' % het]

    if models is not None:
        with open(models, 'r') as f:
            model_list = f.read().splitlines()
        # data_type was validated above, so the lookup cannot fail.
        allowed = valid_models[data_type]
        for m in model_list:
            if m not in allowed:
                raise sysutils.PipelineStepError(
                    "At least one model is not valid")
        # The model names are joined with bare commas. The previous
        # str(list)[1:-1] form produced "'JC', 'HKY'", which does not
        # stay a single argument once there is more than one model.
        cmd1 += ['-m %s' % ','.join(model_list)]

    if schemes is not None:
        if schemes not in [3, 5, 7, 11, 203]:
            raise sysutils.PipelineStepError("Schemes not valid")
        cmd1 += ['-s %d' % schemes]

    if template is not None:
        if template not in ['raxml', 'phyml', 'mrbayes', 'paup']:
            raise sysutils.PipelineStepError("Template not valid")
        cmd1 += ['-T %s' % template]

    # run command
    try:
        sysutils.command_runner([
            cmd1,
        ], 'model_test', quiet, logfile, debug)
    except sysutils.PipelineStepError as p:
        # Return code -6 is tolerated; anything else is a failure.
        if p.returncode == -6:
            print("Warning: ignoring returncode -6")
        else:
            raise sysutils.PipelineStepError("Error in ModelTest-NG")

    # copy output file and delete tempdir
    tmp_out = os.path.join(tempdir, '%s.out' % outname)
    if os.path.exists(tmp_out):
        shutil.copy(tmp_out, os.path.abspath(outdir))
    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'model_test', quiet, logfile)

    # Parse .out file and write TSV summary file
    out_file = os.path.join(outdir, '%s.out' % outname)
    summary_file = os.path.join(outdir, '%s_summary.tsv' % outname)
    criteria = []
    bestmods = []
    with open(out_file) as f1:
        for line in f1.read().splitlines():
            # The last space-separated token of each matching line is kept.
            if "Best model according to" in line:
                criteria += line.split(' ')[-1:]
            if "Model: " in line:
                bestmods += line.split(' ')[-1:]
    with open(summary_file, 'w') as f2:
        f2.write('File\tCriteria\tBest Model\n')
        for i, criterion in enumerate(criteria):
            f2.write('%s\t%s\t%s\n' % (seqs, criterion, bestmods[i]))

    # completion message
    cmd2 = [
        'echo',
        'Stage completed. Output file is located here: %s\n' %
        os.path.abspath(out_file), 'echo',
        'Summary TSV file is located here: %s\n' %
        os.path.abspath(summary_file)
    ]
    sysutils.command_runner([
        cmd2,
    ], 'model_test', quiet, logfile, debug)

    return
Beispiel #16
0
def assemble_denovo_spades(fq1=None,
                           fq2=None,
                           fqU=None,
                           outdir='.',
                           no_error_correction=False,
                           subsample=None,
                           seed=None,
                           ncpu=1,
                           keep_tmp=False,
                           quiet=False,
                           logfile=None,
                           debug=False,
                           **kwargs):
    """ Pipeline step to assemble reads using spades (denovo)

    SPAdes is run in a temporary directory; its ``contigs.fasta`` is copied
    to ``<outdir>/denovo_contigs.fna`` and assembly statistics are written
    to ``<outdir>/denovo_summary.txt``.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        outdir (str): Path to output directory
        no_error_correction (bool): do not perform error correction
            (passes ``--only-assembler``)
        subsample (int): use a subsample of reads for assembly
        seed (int): Seed for random number generator
        ncpu (int): Number of CPUs to use
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run
        **kwargs: Not used.

    Returns:
        out_fa (str): Path to assembled contigs file (fasta format)
        out_summary (str): Path to assembly summary

    Raises:
        MissingRequiredArgument: If the combination of fastq inputs is not
            paired, single, or both.

    """
    # Check inputs
    if fq1 is not None and fq2 is not None and fqU is None:
        input_reads = "paired"  # Paired end
    elif fq1 is None and fq2 is None and fqU is not None:
        input_reads = "single"  # Single end
    elif fq1 is not None and fq2 is not None and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    # Check dependencies
    sysutils.check_dependency('spades.py')

    # Outputs
    out_fa = os.path.join(outdir, 'denovo_contigs.fna')
    out_summary = os.path.join(outdir, 'denovo_summary.txt')

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_spades', None, quiet, logfile)

    # Subsample: the sampled files in tempdir replace the full inputs
    if subsample is not None:
        full1, full2, fullU = fq1, fq2, fqU
        fq1, fq2, fqU = sample_reads.sample_reads(fq1=full1,
                                                  fq2=full2,
                                                  fqU=fullU,
                                                  outdir=tempdir,
                                                  nreads=subsample,
                                                  seed=seed,
                                                  quiet=quiet,
                                                  logfile=logfile,
                                                  debug=debug)

    # spades command
    cmd1 = [
        'spades.py',
        '-o',
        tempdir,
        '-t',
        '%d' % ncpu,
    ]
    if input_reads in ('paired', 'both'):
        cmd1 += [
            '-1',
            os.path.abspath(fq1),
            '-2',
            os.path.abspath(fq2),
        ]
    if input_reads in ('single', 'both'):
        cmd1 += [
            '-s',
            os.path.abspath(fqU),
        ]
    if no_error_correction:
        cmd1 += [
            '--only-assembler',
        ]

    sysutils.command_runner([
        cmd1,
    ], 'assemble_spades', quiet, logfile, debug)
    shutil.copy(os.path.join(tempdir, 'contigs.fasta'), out_fa)

    if os.path.isfile(out_fa):
        # Plain 'r' replaces the 'rU' mode, which newer Python versions
        # reject; the context manager also closes the input handle.
        with open(out_fa, 'r') as inh, open(out_summary, 'w') as outh:
            sequtils.assembly_stats(inh, outh)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_spades', quiet, logfile)

    return out_fa, out_summary
Beispiel #17
0
def assemble_amplicons(contigs_fa=None,
                       ref_fa=None,
                       ref_gtf=None,
                       outdir='.',
                       sample_id='sampleXX',
                       padding=50,
                       min_contig_len=200,
                       keep_tmp=False,
                       quiet=False,
                       logfile=None,
                       debug=False):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    For every ``amplicon`` feature in the GTF, the (padded) reference
    sequence is extracted, contigs are aligned to it with nucmer, and the
    tiled alignments are merged into one scaffold per amplicon.

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file

    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    # The padded file is opened in append mode below, so start from scratch.
    if os.path.exists(out_padded):
        os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir('assemble_amplicons', None, quiet,
                                      logfile)

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold
    # using nucmer
    amplicon_alignments = []
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]

    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)
        # Extract reference amplicon, clamping the padded interval to the
        # bounds of the reference sequence.
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(tmp_contigs_fa,
                                           amplicon_fa,
                                           tempdir,
                                           min_contig_len=min_contig_len,
                                           quiet=quiet,
                                           logfile=logfile,
                                           debug=debug)

        # Skip everything else if debugging
        if debug:
            continue

        # Parse tiling and show alignments. Plain 'r' replaces the 'rU'
        # mode, which newer Python versions reject; the context manager
        # also closes the handle.
        with open(til, 'r') as tilh:
            trows = [alignutils.TilingRow(l) for l in tilh]
        if not trows:
            # No contig tiles this amplicon; recorded as a failure below.
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower())
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                    with open(out_padded, 'a') as outh:
                        print('%s\n%s\n%s' %
                              (tr, combined.raln(), combined.qaln()),
                              file=outh)
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], combined))

        # Cleanup
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                if logfile is not None:
                    print(u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                          file=logfile)
            else:
                scaf, _, _ = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)

            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
Beispiel #18
0
def stageparser(parser):
    """ Register this stage's command-line options on an argparse parser

    Which assembly options are offered depends on whether the ``Trinity``
    and ``spades.py`` dependency checks succeed when this function is
    called.

    Args:
        parser (argparse.ArgumentParser): ArgumentParser object

    Returns:
        None

    """
    def _dependency_ok(program):
        # True when the dependency check passes, False when it raises.
        try:
            sysutils.check_dependency(program)
        except sysutils.PipelineStepError:
            return False
        return True

    io_opts = parser.add_argument_group('Input/Output')
    io_opts.add_argument(
        '--fq1', type=sysutils.existing_file, help='Fastq file with read 1')
    io_opts.add_argument(
        '--fq2', type=sysutils.existing_file, help='Fastq file with read 2')
    io_opts.add_argument(
        '--fqU',
        type=sysutils.existing_file,
        help='Fastq file with unpaired reads')
    io_opts.add_argument(
        '--outdir',
        type=sysutils.existing_dir,
        default='.',
        help='Output directory')

    assembly_opts = parser.add_argument_group('Assembly options')
    have_trinity = _dependency_ok('Trinity')
    have_spades = _dependency_ok('spades.py')

    # Offer a choice only when both assemblers are installed; otherwise the
    # single available one becomes the default without a flag.
    if have_trinity and have_spades:
        assembly_opts.add_argument(
            '--assembler',
            default='spades',
            choices=['spades', 'trinity'],
            help='''Assembler to use.''')
    elif have_trinity:
        assembly_opts.set_defaults(assembler="trinity")
    elif have_spades:
        assembly_opts.set_defaults(assembler="spades")

    if have_spades:
        assembly_opts.add_argument(
            '--no_error_correction',
            action='store_true',
            help='Do not perform error correction [spades only]')
    if have_trinity:
        assembly_opts.add_argument(
            '--min_contig_length',
            type=int,
            default=200,
            help='''Minimum assembled contig length to report
                                    [Trinity only]''')
    assembly_opts.add_argument(
        '--subsample',
        type=int,
        help='Use a subsample of reads for assembly.')
    assembly_opts.add_argument(
        '--seed',
        type=int,
        help='''Seed for random number generator (ignored if
                                not subsampling).''')

    settings = parser.add_argument_group('Settings')
    settings.add_argument(
        '--ncpu', type=int, default=1, help='Number of CPU to use')
    settings.add_argument(
        '--keep_tmp', action='store_true', help='Keep temporary directory')
    settings.add_argument(
        '--quiet',
        action='store_true',
        help='''Do not write output to console
                                (silence stdout and stderr)''')
    settings.add_argument(
        '--logfile',
        type=argparse.FileType('a'),
        help='Append console output to this file')
    settings.add_argument(
        '--debug', action='store_true', help='Print commands but do not run')

    # Entry point run for this stage.
    parser.set_defaults(func=assemble_denovo)
Beispiel #19
0
def align_reads(
    fq1=None,
    fq2=None,
    fqU=None,
    ref_fa=None,
    outdir='.',
    bt2_preset='sensitive-local',
    sample_id='sampleXX',
    no_realign=False,
    remove_duplicates=False,
    encoding=None,
    ncpu=1,
    xmx=sysutils.get_java_heap_size(),
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to align reads

    Copies and indexes the reference in a temporary directory, aligns the
    reads with bowtie2, sorts and indexes the alignment with samtools, runs
    picard MarkDuplicates and, unless ``no_realign`` is set, the GATK
    RealignerTargetCreator / IndelRealigner pair. The resulting BAM is moved
    to ``<outdir>/aligned.bam`` and indexed.

    Args:
        fq1 (str): Path to fastq file with read 1
        fq2 (str): Path to fastq file with read 2
        fqU (str): Path to fastq file with unpaired reads
        ref_fa (str): Path to reference fasta file
        outdir (str): Path to output directory
        bt2_preset (str): Bowtie2 preset to use for alignment
        sample_id (str): Read group ID
        no_realign (bool): Do not realign indels
        remove_duplicates (bool): Remove duplicates from final alignment
        encoding (str): Quality score encoding. "Phred+33" selects
            ``--phred33``; any other value selects ``--phred64``. When None
            it is guessed from fqU (unpaired-only input) or fq1.
        ncpu (int): Number of CPUs to use
        xmx (int): Maximum heap size for JVM in GB
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_aligned (str): Path to aligned BAM file
        out_bt2 (str): Path to bowtie2 report

    Raises:
        MissingRequiredArgument: If the combination of fq1/fq2/fqU is not
            one of paired, unpaired, or paired plus unpaired.
        PipelineStepError: Re-raised when the bowtie2 step fails, or raised
            when the final BAM file does not exist.

    """
    # Check inputs
    has_pair = fq1 is not None and fq2 is not None
    no_pair = fq1 is None and fq2 is None
    if has_pair and fqU is None:
        input_reads = "paired"  # Paired end
    elif no_pair and fqU is not None:
        input_reads = "single"  # Single end
    elif has_pair and fqU is not None:
        input_reads = "both"
    else:
        msg = "incorrect input reads; requires either "
        msg += "(--fq1 AND --fq2) OR (--fqU) OR (--fq1 AND --fq2 AND --fqU)"
        raise MissingRequiredArgument(msg)

    if encoding is None:
        if input_reads == 'single':
            encoding = helpers.guess_encoding(fqU)
        else:
            encoding = helpers.guess_encoding(fq1)

    # Check dependencies
    sysutils.check_dependency('bowtie2')
    sysutils.check_dependency('samtools')
    sysutils.check_dependency('picard')

    # Identify correct command for GATK
    gatk_bin = sysutils.determine_dependency_path(['gatk', 'gatk3'])

    # Set JVM heap argument (for GATK); prefixed to the GATK command lines
    java_heap = '_JAVA_OPTIONS="-Xmx%dg"' % xmx

    # Outputs
    out_aligned = os.path.join(outdir, 'aligned.bam')
    out_bt2 = os.path.join(outdir, 'aligned.bt2.out')

    # Temporary directory
    tempdir = sysutils.create_tempdir('align_reads', None, quiet, logfile)

    # Copy and index initial reference
    curref = os.path.join(tempdir, 'initial.fasta')
    bt2_index = os.path.join(tempdir, 'initial')
    cmd1 = ['cp', ref_fa, curref]
    cmd2 = ['samtools', 'faidx', curref]
    cmd3 = [
        'picard', 'CreateSequenceDictionary',
        'R=%s' % curref,
        'O=%s' % os.path.join(tempdir, 'initial.dict'),
    ]
    cmd4 = ['bowtie2-build', curref, bt2_index]
    sysutils.command_runner([cmd1, cmd2, cmd3, cmd4], 'align_reads:index',
                            quiet, logfile, debug)

    # Align with bowtie2
    bt2_sam = os.path.join(tempdir, 'aligned.bt2.sam')
    cmd5 = [
        'bowtie2',
        '-p', '%d' % ncpu,
        '--phred33' if encoding == "Phred+33" else '--phred64',
        '--no-unal',
        '--rg-id', sample_id,
        '--rg', 'SM:%s' % sample_id,
        '--rg', 'LB:1',
        '--rg', 'PU:1',
        '--rg', 'PL:illumina',
        '--%s' % bt2_preset,
        '-x', '%s' % bt2_index,
    ]
    # These two checks are intentionally independent: with "both", bowtie2
    # must receive the paired files AND the unpaired file. Chaining them with
    # `elif` silently dropped the unpaired reads.
    if input_reads in ('paired', 'both'):
        cmd5 += ['-1', fq1, '-2', fq2]
    if input_reads in ('single', 'both'):
        cmd5 += ['-U', fqU]
    cmd5 += ['-S', bt2_sam]
    # Capture the bowtie2 report (written to stderr) in out_bt2.
    # NOTE(review): relies on command_runner executing through a shell so
    # that '2>' is treated as a redirection - confirm.
    cmd5 += ['2>', out_bt2]

    try:
        sysutils.command_runner([cmd5], 'align_reads:bowtie2', quiet, logfile,
                                debug)
    except PipelineStepError:
        # stderr was redirected to out_bt2, so surface it before re-raising
        if os.path.exists(out_bt2):
            with open(out_bt2, 'r') as fh:
                print('[--- bowtie2 stderr ---]\n%s' % fh.read(),
                      file=sys.stderr)
        raise

    # Convert to BAM, sort and index
    sorted_bam = os.path.join(tempdir, 'sorted.bam')
    cmd6 = [
        'samtools', 'view', '-u', bt2_sam,
        '|',
        'samtools', 'sort',
        '>', sorted_bam,
    ]
    cmd7 = ['samtools', 'index', sorted_bam]
    sysutils.command_runner([cmd6, cmd7], 'align_reads:samsort', quiet,
                            logfile, debug)

    # cur_bam always points at the output of the most recent step
    cur_bam = sorted_bam

    if remove_duplicates:
        sysutils.log_message('[--- Removing duplicates ---]', quiet, logfile)
    else:
        sysutils.log_message('[--- Marking duplicates ---]', quiet, logfile)

    # MarkDuplicates
    rmdup_bam = os.path.join(tempdir, 'rmdup.bam')
    cmd8 = [
        'picard',
        'MarkDuplicates',
        'CREATE_INDEX=true',
        'USE_JDK_DEFLATER=true',
        'USE_JDK_INFLATER=true',
        'M=%s' % os.path.join(tempdir, 'rmdup.metrics.txt'),
        'I=%s' % cur_bam,
        'O=%s' % rmdup_bam,
    ]
    if remove_duplicates:
        cmd8 += ['REMOVE_DUPLICATES=true']
    sysutils.command_runner([cmd8], 'align_reads:markdups', quiet, logfile,
                            debug)
    cur_bam = rmdup_bam

    if no_realign:
        print('[--- Skipping realignment ---]', file=sys.stderr)
    else:
        intervals = os.path.join(tempdir, 'tmp.intervals')
        realign_bam = os.path.join(tempdir, 'realign.bam')
        # RealignerTargetCreator
        cmd9 = [
            java_heap, gatk_bin,
            '-T', 'RealignerTargetCreator',
            '-I', cur_bam,
            '-R', curref,
            '-o', intervals,
        ]
        # IndelRealigner
        cmd10 = [
            java_heap, gatk_bin,
            '-T', 'IndelRealigner',
            '--use_jdk_deflater',
            '--use_jdk_inflater',
            '-maxReads', '1000000',
            '-dt', 'NONE',
            '-I', cur_bam,
            '-R', curref,
            '-targetIntervals', intervals,
            '-o', realign_bam,
        ]
        sysutils.command_runner([cmd9, cmd10], 'align_reads:realign', quiet,
                                logfile, debug)
        cur_bam = realign_bam

    # Check that cur_bam was created
    if not os.path.exists(cur_bam):
        msg = "BAM does not exist: %s" % cur_bam
        raise sysutils.PipelineStepError(msg)

    # Replace any previous output with the final BAM and index it
    cmd11a = ['rm', '-f', out_aligned]
    cmd11b = ['mv', cur_bam, out_aligned]
    cmd11c = ['samtools', 'index', out_aligned]
    sysutils.command_runner([cmd11a, cmd11b, cmd11c], 'align_reads:copy',
                            quiet, logfile, debug)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'align_reads', quiet, logfile)

    return out_aligned, out_bt2
Beispiel #20
0
def demo(outdir=".", refonly=False):
    """ Download the demo references and optionally run the demo

    Creates ``<outdir>/haphpipe_demo`` if it does not exist, downloads
    ``refs.tar.gz`` into it with curl, extracts the archive into that
    directory and removes the tarball. Unless ``refonly`` is set, it then
    checks that ``fastq-dump`` is available and runs the ``haphpipe_demo``
    command.

    Args:
        outdir (str): Directory in which ``haphpipe_demo`` is created
        refonly (bool): Only set up the references; do not run the demo

    """
    # This file, demo.py, is located within "stages", so the package root is
    # up one directory
    _base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # makedirs also creates outdir itself when it is missing
    hpd = os.path.join(outdir, 'haphpipe_demo')
    if not os.path.exists(hpd):
        os.makedirs(hpd)

    refs = os.path.join(hpd, 'refs.tar.gz')

    # download ref command
    download_cmd = [
        'curl', '-L',
        'https://github.com/gwcbi/haphpipe/blob/master/bin/refs.tar.gz?raw=true',
        '>', refs
    ]
    sysutils.command_runner([download_cmd], 'refs')

    # unzip refs; use the tarball path under outdir rather than a path
    # relative to the current working directory, so a non-default outdir works
    extract_cmd = ['tar', '-xzvf', refs, '-C', hpd]
    cleanup_cmd = ['rm', refs]
    sysutils.command_runner([extract_cmd, cleanup_cmd], 'refs')

    print(_base, file=sys.stderr)

    if refonly is False:
        print(
            "Setting up demo directories and references in outdirectory %s. Demo samples will now run."
            % hpd)

        # Check for executable
        sysutils.check_dependency("fastq-dump")

        # Demo command; run it against the directory that now holds the refs.
        # NOTE(review): assumes the script's first argument is its output
        # directory - confirm against the haphpipe_demo script.
        demo_cmd = ['haphpipe_demo', hpd]
        sysutils.command_runner([demo_cmd], 'demo')
    else:
        # Report the extraction directory; the tarball itself was removed
        print(
            "Demo was run with --refonly. References are now in outdirectory: %s."
            % hpd)