Example #1
def getMask(globs, cmds, vcf_file):
    # Get the sites to be masked into a bed file.

    mask_bedfile = os.path.join(globs['iterfadir'],
                                "iter-" + globs['iter-str'] + "-masksites.bed")
    if globs['diploid']:
        mask_bedfile = mask_bedfile.replace("-masksites.bed",
                                            "-diploid-masksites.bed")

    cmd = "zgrep \"\./\.\" " + vcf_file + " | awk '{{OFS=\"\t\"; if ($0 !~ /\#/); print $1, $2-1, $2}}' | bedtools merge -i - > " + mask_bedfile
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get mask sites",
        'outfile': mask_bedfile,
        'logfile': "",
        'start': False
    }

    run = True
    if globs['resume']:
        if os.path.isfile(mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
            PC.report_step(globs, cmds, cmd, "RESUME",
                           "previous output found: " + mask_bedfile)
            run = False

    if run:
        if not globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)

            if os.path.isfile(
                    mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
                num_sites = str(len(open(mask_bedfile, "r").readlines()))
                PC.report_step(globs, cmds, cmd, "SUCCESS",
                               num_sites + " mask sites read: " + mask_bedfile)
            else:
                PC.report_step(
                    globs, cmds, cmd, "ERROR!",
                    "Mask sites file not found or empty: " + mask_bedfile)
                globs['exit-code'] = 1
                PC.endProg(globs)

        else:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)

    return mask_bedfile, cmds
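
The awk step above converts the 1-based POS of each missing-genotype ("./.") record into a 0-based, half-open BED interval before merging. A minimal standalone sketch of that coordinate conversion, using hypothetical records:

# Hypothetical (CHROM, POS) pairs standing in for the zgrep output above.
vcf_records = [("chr1", 101), ("chr1", 102), ("chr2", 37)]
# BED is 0-based and half-open, hence "$2-1, $2" in the awk command.
bed_lines = ["\t".join([chrom, str(pos - 1), str(pos)]) for chrom, pos in vcf_records]
print("\n".join(bed_lines))
# chr1    100    101
# chr1    101    102
# chr2    36     37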
Example #2
def varFilterManual(globs, cmds):
    # This function filters SNPs from the called set for each iteration based on
    # an input VCF provided by -vcf. The implementation first unzips the iteration VCF file,
    # loops through it, and checks each SNP against the set from the input VCF. It then re-zips the
    # iteration VCF. This is very slow and meant only for small genomes/numbers of SNPs.
    # Specifically implemented for SARS-CoV-2 genomes.

    for scaff in globs['scaffolds']:
        if globs['dryrun']:
            PC.report_step(globs, cmds, "NA Filtering variants from input VCF",
                           "DRYRUN", globs['in-vcf'])
        else:
            PC.report_step(globs, cmds, "NA Filtering variants from input VCF",
                           "EXECUTING", globs['in-vcf'])
            filter_file = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + "-filter.vcf.gz")
            filter_file_unzipped = filter_file.replace(".gz", "")
            os.system("gunzip " + filter_file)
            vcflines = [
                line.strip().split("\t") for line in open(filter_file_unzipped)
            ]
            # Unzip and read the iteration VCF file.

            num_filtered = 0
            for i in range(len(vcflines)):
                if vcflines[i][0].startswith("#"):
                    continue
                # Check each SNP in the VCF file; skip the header lines.

                for snp in globs['filter-sites']:
                    if vcflines[i][1] == snp[1] and vcflines[i][4] in snp[4]:
                        if vcflines[i][6] == "PASS":
                            vcflines[i][6] = "pseudoit"
                        elif "pseudoit" not in vcflines[i][6]:
                            vcflines[i][6] += ";pseudoit"
                        num_filtered += 1
                # Check each SNP in the provided -vcf file. If it matches the current SNP, add the filter string to the
                # FILTER column.

            with open(filter_file_unzipped, "w") as new_vcf:
                for line in vcflines:
                    new_vcf.write("\t".join(line) + "\n")
            # Re-write the iteration VCF file.

            os.system("bgzip " + filter_file_unzipped)
            # Re-compress the iteration VCF file.

            PC.report_step(globs, cmds,
                           "NA " + str(num_filtered) + " sites filtered",
                           "SUCCESS", globs['in-vcf'])
Example #3
def getScaffs(cur_fa, globs, cmds, report_status=True):
    # Save the list of scaffolds/contigs/chromosomes from a FASTA file to a text file.

    cmd = "grep \">\" " + cur_fa + " | sed 's/>//g'"  # > " + globs['scaffs'];
    # grep the scaffold IDs from the reference... I guess this could also be done by just reading
    # the sequence names from the index file...

    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get ref scaffold IDs",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    # Add the grep command to the global commands dict.

    if not globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        cmd_result = subprocess.run(cmd,
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
        cur_scaffs = list(filter(None,
                                 cmd_result.stdout.decode().split("\n")))
        globs['scaffolds'] = [
            scaff[:scaff.index(" ")] if " " in scaff else scaff
            for scaff in cur_scaffs
        ]
        PC.report_step(globs, cmds, cmd, "SUCCESS",
                       str(len(globs['scaffolds'])) + " scaffold IDs read")
    else:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        globs['scaffolds'] = []
    # Run the grep command and parse the scaffold IDs from its output.

    return cmds
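
For reference, the same scaffold-ID extraction the grep/sed pipeline performs can be sketched in pure Python (hypothetical file name; anything after the first space in a header is dropped, as above):

scaffolds = []
with open("ref.fa") as fasta:                       # hypothetical reference FASTA
    for line in fasta:
        if line.startswith(">"):
            header = line[1:].strip()
            scaffolds.append(header.split(" ")[0])  # keep only the sequence ID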
Example #4
def indexCheck(cur_fa, globs, cmds):
    # Checks that the user has created the proper index files before running the program.

    ref_ext = PC.detectRefExt(cur_fa, globs)

    dictfile = cur_fa.replace(ref_ext, ".dict")
    cmd = "os.path.isfile(" + dictfile + ")"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Checking ref indices",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    if not os.path.isfile(dictfile):
        PC.errorOut(
            "REF1",
            "Reference dictionary not found. Please run: picard CreateSequenceDictionary R=<ref>.fa O=<ref>.dict",
            globs)
    PC.report_step(globs, cmds, cmd, "SUCCESS", "index file found", "")
    # Check for the reference dictionary file.

    faidxfile = cur_fa + ".fai"
    cmd = "os.path.isfile(" + faidxfile + ")"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Checking ref indices",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    if not os.path.isfile(faidxfile):
        PC.errorOut(
            "REF2",
            "Reference index (samtools) not found. Please run: samtools faidx <ref>.fa",
            globs)
    PC.report_step(globs, cmds, cmd, "SUCCESS", "index file found")
    # Check for the reference faidx file.

    if globs['mapper'] == "bwa":
        indexfiles = [
            cur_fa + ".amb", cur_fa + ".ann", cur_fa + ".bwt", cur_fa + ".pac",
            cur_fa + ".sa"
        ]
        cmd = "os.path.isfile(" + ",".join(indexfiles) + ")"
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Checking ref indices",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        if any(not os.path.isfile(f) for f in indexfiles):
            PC.errorOut(
                "REF3",
                "Reference index (bwa) not found. Please run: bwa index <ref>.fa",
                globs)
        PC.report_step(globs, cmds, cmd, "SUCCESS", "index files found")
    # Check for the bwa index files if --mapper is bwa.

    elif globs['mapper'] == "hisat2":
        indexfile = cur_fa + ".1.ht2"
        cmd = "os.path.isfile(" + indexfile + ")"
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Checking ref indices",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        if not os.path.isfile(indexfile):
            PC.errorOut(
                "REF3",
                "Reference index (hisat2) not found. Please run: hisat2-build <ref>.fa <ref>.fa",
                globs)
        PC.report_step(globs, cmds, cmd, "SUCCESS", "index file found")
    # Check for the hisat2 index files if --mapper is hisat2.

    return cmds
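
The check-and-hint pattern repeats above for each index type; a minimal standalone sketch of it, with a hypothetical helper name and the hints taken from the error messages above:

import os

def check_index(path, hint):
    # Hypothetical helper: fail fast with a tool-specific hint if an index file is missing.
    if not os.path.isfile(path):
        raise FileNotFoundError("Reference index not found: " + path + ". Please run: " + hint)

check_index("ref.dict", "picard CreateSequenceDictionary R=ref.fa O=ref.dict")
check_index("ref.fa.fai", "samtools faidx ref.fa")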
Example #5
def mergeBam(globs, cmds, bamfiles):
    # Merge BAM files from different library types.

    cur_logfile = os.path.join(
        globs['iterlogdir'],
        "picard-merge-bam-iter-" + globs['iter-str'] + ".log")
    merged_bamfile = os.path.join(
        globs['iterbamdir'], "merged-iter-" + globs['iter-str'] + ".bam.gz")
    # Get the log file and merged bam file name to output to.

    if len(bamfiles) > 1:
        # We only need to run picard if there are multiple bam files from mapping

        merge_cmd = globs['picard-path'] + " MergeSamFiles "
        for bamfile in bamfiles:
            merge_cmd += "I=" + bamfile + " "
        if globs['tmpdir'] != "System default.":
            merge_cmd += "TMP_DIR=\"" + globs['tmpdir'] + "\" "
        if not globs['mkdups']:
            merge_cmd += "CREATE_INDEX=true "
        merge_cmd += "USE_THREADING=TRUE VALIDATION_STRINGENCY=LENIENT O=" + merged_bamfile
        # Generate the MergeSamFiles command.

        cmds[merge_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Merge BAM files",
            'outfile': merged_bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Add the MergeSamFiles command to the global cmds dict.

        exit_flag = PC.runCMD(merge_cmd, globs, cmds, True)
        PC.exitCheck(exit_flag, globs)
        # Run the command and check for errors.

    else:
        # If there was only one bam file from mapping we don't need to merge, just move it to the expected location.

        merge_cmd = "mv " + bamfiles[0] + " " + merged_bamfile
        # Generate the mv command.

        cmds[merge_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Rename BAM file",
            'outfile': merged_bamfile,
            'logfile': "",
            'start': False
        }
        # Add the mv command to the global commands dict.

        if globs['dryrun']:
            PC.report_step(globs, cmds, merge_cmd, "DRYRUN")
        else:
            PC.report_step(globs, cmds, merge_cmd, "EXECUTING")
            os.system(merge_cmd)
            if os.path.isfile(merged_bamfile):
                PC.report_step(globs, cmds, merge_cmd, "SUCCESS")
            else:
                PC.report_step(globs, cmds, merge_cmd, "ERROR")
                PC.errorOut("PIMAP1", "Error renaming BAM file.", globs)
        # Run the command and check for errors.

    return merged_bamfile, cmds
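
For illustration, with two hypothetical BAM files, the default temporary directory, and duplicate marking still to follow (so CREATE_INDEX is omitted), the merge command assembled above would look roughly like this:

example_cmd = (
    "picard MergeSamFiles "                     # placeholder for globs['picard-path']
    "I=pe-iter-01.bam.gz I=se-iter-01.bam.gz "  # one I= per input BAM
    "USE_THREADING=TRUE VALIDATION_STRINGENCY=LENIENT "
    "O=merged-iter-01.bam.gz"
)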
Example #6
def varFilter(globs, cmds, cur_ref):
    # Run the command to filter variants from a VCF file based on input filters. Default: "MQ < 30.0 || DP < 5 || DP > 60"

    bcftools_cmds = {}
    for scaff in globs['scaffolds']:
        # if not globs['last-iter'] or (globs['last-iter'] and not globs['indels']):
        #     cur_logfile = os.path.join(globs['itervcflogdir'], "bcftools-filter-" + scaff + "-iter-" + globs['iter-str'] + "-snps.log");
        #     vcf_file = os.path.join(globs['itervcfscaffdir'], scaff + "-iter-" + globs['iter-str'] + "-snps.vcf.gz");
        #     filter_file = os.path.join(globs['itervcfscaffdir'], scaff + "-iter-" + globs['iter-str'] + "-snps-filter.vcf.gz");
        # else:
        cur_logfile = os.path.join(
            globs['itervcflogdir'],
            "bcftools-filter-" + scaff + "-iter-" + globs['iter-str'] + ".log")
        vcf_file = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + ".vcf.gz")
        filter_file = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + "-filter.vcf.gz")

        bcftools_cmd = globs['bcftools-path'] + " filter -m+ -e " + globs[
            'filter'] + " -s pseudoit --IndelGap 5 -Oz -o " + filter_file + " " + vcf_file

        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmds[bcftools_cmd] = {
            'cmd-num': cmd_num,
            'desc': "Filter VCF " + scaff,
            'outfile': filter_file,
            'logfile': cur_logfile,
            'start': False,
            "vcffile": vcf_file
        }
        bcftools_cmds[bcftools_cmd] = {
            'cmd-num': cmd_num,
            'desc': "Filter VCF " + scaff,
            'outfile': filter_file,
            'logfile': cur_logfile,
            'start': False,
            "vcffile": vcf_file
        }

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        bcftools_skeleton_cmd = globs[
            'bcftools-path'] + " filter -m+ -e " + globs[
                'filter'] + " -s pseudoit --IndelGap 5 -Oz -o <filtered vcf> <input vcf>"
        cmds[bcftools_skeleton_cmd] = {
            'cmd-num': cmd_num,
            'desc':
            str(globs['num-procs']) + " bcftools filter procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, bcftools_skeleton_cmd, "DRYRUN",
                       bcftools_skeleton_cmd)

    else:
        pool = mp.Pool(processes=globs['filter-procs'])
        for result in pool.starmap(PC.runCMD,
                                   ((bcftools_cmd, globs, cmds, True)
                                    for bcftools_cmd in bcftools_cmds)):
            if result:
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
        pool.terminate()

    return cmds
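
In the non-dryrun branch the per-scaffold commands are fanned out over a process pool and the run aborts on the first failure; a minimal standalone sketch of that pattern with a hypothetical worker:

import multiprocessing as mp
import subprocess

def run_shell(cmd):
    # Return True on failure, mirroring the exit-flag convention used above.
    return subprocess.run(cmd, shell=True).returncode != 0

if __name__ == "__main__":
    commands = ["echo scaffold_1", "echo scaffold_2"]  # hypothetical per-scaffold commands
    with mp.Pool(processes=2) as pool:
        for failed in pool.imap(run_shell, commands):
            if failed:
                pool.terminate()
                break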
Example #7
def genConsensus(globs, cmds, vcf_file, cur_ref):
    # Run the command to generate a consensus FASTA file from the reference and the variants.

    cmd = "getConsCase()"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Determining case of first base",
        'outfile': "",
        'logfile': "",
        'start': False
    }

    bcftools_cmd = globs[
        'bcftools-path'] + " consensus -f " + cur_ref + " -o " + globs[
            'iter-final-fa']
    if globs['last-iter'] and globs['indels']:
        bcftools_cmd += " -c " + globs['iter-final-chain']
    if globs['last-iter'] and globs['diploid']:
        bcftools_cmd += " -I "
    bcftools_cmd += " -e \"FILTER='pseudoit' || FILTER='IndelGap'\" " + vcf_file
    cmds[bcftools_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Generating consensus",
        'outfile': globs['iter-final-fa'],
        'logfile': globs['iter-consensus-log'],
        'start': False
    }

    run_flag = True
    if globs['resume']:
        run_flag = PC.runCheck(bcftools_cmd, cmds, globs)

    #### RUN RUNCHECK FIRST

    first_lower = False
    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        first_lower, linestr_orig, linestr_repl = True, "a", "A"
    elif run_flag:
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        first_lower, linestr_orig, linestr_repl = getConsCase(cur_ref)
        PC.report_step(globs, cmds, cmd, "SUCCESS",
                       "First base: " + linestr_orig[0])
    # This first_lower stuff is a hack to deal with bcftools consensus using the case of the first base in the reference fasta to inject variants.
    # Possibly resolved: https://github.com/samtools/bcftools/issues/1150#issuecomment-582407490
    # Need to test and make sure it is in official release before I remove this hack.

    if first_lower:
        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + cur_ref
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Changing first ref base to upper case",
            'outfile': "",
            'logfile': "",
            'start': False
        }

        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base converted to upper case")
    # Part of first_lower hack.

    exit_flag = PC.runCMD(bcftools_cmd, globs, cmds, True)
    # Consensus command
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    if first_lower:
        cmd = "sed -i '2 s/" + linestr_repl + "/" + linestr_orig + "/g' " + cur_ref
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Reverting case of first ref base",
            'outfile': "",
            'logfile': "",
            'start': False
        }

        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base reverted to original case")

        if not globs['dryrun']:
            first_lower, linestr_orig, linestr_repl = getConsCase(
                globs['iter-final-fa'])

        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + globs[
            'iter-final-fa']
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Reverting case of first consensus base",
            'outfile': "",
            'logfile': "",
            'start': False
        }

        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base reverted to original case")
    # Part of first_lower hack.

    globs['consensus-file'] = globs['iter-final-fa']

    return cmds, globs
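
For illustration, on a final iteration with diploid output enabled and indels excluded, the consensus command assembled above would look roughly like this (hypothetical paths following the naming used elsewhere in these examples; -I emits IUPAC ambiguity codes for heterozygous sites):

example_cmd = (
    "bcftools consensus -f masked-ref.fa "              # masked reference from the previous iteration
    "-o iter-03-snps-diploid-final.fa -I "
    "-e \"FILTER='pseudoit' || FILTER='IndelGap'\" "     # skip records flagged by the filtering steps
    "iter-03-filter-snps.vcf.gz"
)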
Example #8
def mapping(globs):

    globs = PC.getIterStr(globs)
    PC.printWrite(
        globs['logfilename'], globs['log-v'], "#\n# " + "=" * 51 +
        " ITERATION " + globs['iter-str'] + " STARTING! " + "=" * 50)

    globs['iterstarttime'] = PC.report_step(globs, "", start=True)
    if globs['iteration'] == 1:
        globs['progstarttime'] = globs['iterstarttime']

    cmds = {}

    cur_ref = PC.getRef(globs)
    globs['last-iter'] = False
    if globs['iteration'] == globs['num-iters']:
        globs['last-iter'] = True
    # Iteration prep

    globs['iterdir'] = os.path.join(globs['outdir'],
                                    "iter-" + globs['iter-str'])
    globs['iterbamdir'] = os.path.join(globs['iterdir'], "bam")
    globs['itervcfdir'] = os.path.join(globs['iterdir'], "vcf")
    globs['iterfadir'] = os.path.join(globs['iterdir'], "fa")
    globs['iterlogdir'] = os.path.join(globs['iterdir'], "logs")
    for d in [
            globs['iterdir'], globs['iterbamdir'], globs['itervcfdir'],
            globs['iterfadir'], globs['iterlogdir']
    ]:
        if not os.path.isdir(d):
            if globs['map-only'] and d not in [
                    globs['iterbamdir'], globs['iterlogdir']
            ]:
                continue
            os.makedirs(d)
    if globs['last-iter']:
        globs['itervcfscaffdir'] = os.path.join(globs['itervcfdir'],
                                                "gvcf-scaff")
        globs['itervcflogdir'] = os.path.join(globs['itervcfdir'], "gvcf-logs")
    else:
        globs['itervcfscaffdir'] = os.path.join(globs['itervcfdir'],
                                                "vcf-scaff")
        globs['itervcflogdir'] = os.path.join(globs['itervcfdir'], "vcf-logs")
    if not globs['map-only']:
        for d in [globs['itervcfscaffdir'], globs['itervcflogdir']]:
            if not os.path.isdir(d):
                os.makedirs(d)
    # Make directories for current iteration

    if globs['bam'] and globs['iteration'] == 1:
        globs['iter-final-bam-log'] = "NA"
        globs['iter-final-bam'] = globs['bam']
    else:
        if globs['mkdups']:
            globs['iter-final-bam-log'] = os.path.join(
                globs['iterlogdir'],
                "picard-mkdup-iter-" + globs['iter-str'] + ".log")
            globs['iter-final-bam'] = os.path.join(
                globs['iterbamdir'],
                "merged-rg-mkdup-iter-" + globs['iter-str'] + ".bam.gz")
        else:
            globs['iter-final-bam-log'] = os.path.join(
                globs['iterlogdir'],
                "picard-merge-bam-iter-" + globs['iter-str'] + ".log")
            globs['iter-final-bam'] = os.path.join(
                globs['iterbamdir'],
                "merged-iter-" + globs['iter-str'] + ".bam.gz")
        # If --nomkdup is set, the final BAM file for each iteration should not have the mkdup suffix.
    # Final BAM file for this iteration

    if globs['last-iter']:
        globs['iter-gather-vcf-log'] = os.path.join(
            globs['iterlogdir'],
            "gatk-gathervcfs-iter-" + globs['iter-str'] + ".log")
        globs['iter-gather-vcf'] = os.path.join(
            globs['itervcfdir'],
            "iter-" + globs['iter-str'] + "-filter.vcf.gz")

        if globs['indels']:
            globs['iter-final-vcf-log'] = globs['iter-gather-vcf-log']
            globs['iter-final-vcf'] = globs['iter-gather-vcf']
        else:
            globs['iter-final-vcf-log'] = os.path.join(
                globs['iterlogdir'],
                "gatk-selectsnps-iter-" + globs['iter-str'] + ".log")
            globs['iter-final-vcf'] = os.path.join(
                globs['itervcfdir'],
                "iter-" + globs['iter-str'] + "-filter-snps.vcf.gz")
    else:
        globs['iter-gather-vcf-log'] = os.path.join(
            globs['iterlogdir'],
            "gatk-gathervcfs-iter-" + globs['iter-str'] + "-intermediate.log")
        globs['iter-gather-vcf'] = os.path.join(
            globs['itervcfdir'],
            "iter-" + globs['iter-str'] + "-filter-intermediate.vcf.gz")

        globs['iter-final-vcf-log'] = os.path.join(
            globs['iterlogdir'],
            "gatk-selectsnps-iter-" + globs['iter-str'] + "-intermediate.log")
        globs['iter-final-vcf'] = os.path.join(
            globs['itervcfdir'],
            "iter-" + globs['iter-str'] + "-filter-intermediate-snps.vcf.gz")
    # Final VCF file for this iteration

    if globs['last-iter']:
        if globs['indels']:
            globs['iter-consensus-log'] = os.path.join(
                globs['iterlogdir'],
                "bcftools-consensus-iter-" + globs['iter-str'] + "-final.log")
            globs['iter-final-chain'] = os.path.join(
                globs['iterfadir'],
                "iter-" + globs['iter-str'] + "-final.chain")
            globs['iter-final-fa'] = os.path.join(
                globs['iterfadir'], "iter-" + globs['iter-str'] + "-final.fa")

        else:
            globs['iter-consensus-log'] = os.path.join(
                globs['iterlogdir'], "bcftools-consensus-iter-" +
                globs['iter-str'] + "-snps-final.log")
            globs['iter-final-chain'] = os.path.join(
                globs['iterfadir'],
                "iter-" + globs['iter-str'] + "-snps-final.chain")
            globs['iter-final-fa'] = os.path.join(
                globs['iterfadir'],
                "iter-" + globs['iter-str'] + "-snps-final.fa")

        if globs['diploid']:
            globs['iter-consensus-log'] = globs['iter-consensus-log'].replace(
                "-final.log", "-diploid-final.log")
            globs['iter-final-chain'] = globs['iter-final-chain'].replace(
                "-final.chain", "-diploid-final.chain")
            globs['iter-final-fa'] = globs['iter-final-fa'].replace(
                "-final.fa", "-diploid-final.fa")

    else:
        globs['iter-consensus-log'] = os.path.join(
            globs['iterlogdir'], "bcftools-consensus-iter-" +
            globs['iter-str'] + "-snps-intermediate.log")
        globs['iter-final-chain'] = os.path.join(
            globs['iterfadir'],
            "iter-" + globs['iter-str'] + "-snps-intermediate.chain")
        globs['iter-final-fa'] = os.path.join(
            globs['iterfadir'],
            "iter-" + globs['iter-str'] + "-snps-intermediate.fa")
    # Final FASTA files for this iteration
    # Output files for current iteration

    if globs['iteration'] == 1:
        cmds = piref.indexCheck(globs['ref'], globs, cmds)
        cmds = piref.getScaffs(globs['ref'], globs, cmds)
    # Check that all index files have been created and get the scaffold IDs from the reference FASTA

    if globs['bam'] and globs['iteration'] == 1:
        do_mapping = False
    elif globs['resume']:
        do_mapping = PC.prevCheck(globs['iter-final-bam'],
                                  globs['iter-final-bam-log'], globs)
    else:
        do_mapping = True

    if do_mapping:
        do_varcalling = True
    elif globs['resume']:
        do_varcalling = PC.prevCheck(globs['iter-final-vcf'],
                                     globs['iter-final-vcf-log'], globs)
    else:
        do_varcalling = True

    if do_varcalling:
        do_consensus = True
    elif globs['resume']:
        do_consensus = PC.prevCheck(globs['iter-final-fa'],
                                    globs['iter-consensus-log'], globs)
    else:
        do_consensus = True
    # CHECK WHICH STEPS WE NEED TO PERFORM

    statstr = "EXECUTING"
    if globs['resume']:
        statstr = "RESUME"
    if globs['dryrun']:
        statstr = "DRYRUN"
    # Status for the main step reports

    if globs['iteration'] != 1:
        cmds = piref.indexFa(globs, cmds, cur_ref)
    else:
        if globs['in-vcf']:
            PC.report_step(
                globs, cmds, "NA--00   Reading input VCF", statstr,
                "Reading input SNPs to ignore during variant calling.")
            if not globs['dryrun']:
                globs['filter-sites'] = PC.readVCF(globs['in-vcf'])
        # If a VCF file has been provided to filter SNPs with -vcf, read those SNPs here. This is slow, meant for a small number of SNPs
        # in small genomes only.
    # INDEX FASTA IF NOT FIRST ITERATION

    if do_mapping:
        PC.report_step(globs, cmds, "NA--01   Read mapping", statstr,
                       "Mapping reads and post-processing.")

        PC.report_step(globs, cmds, "NA--01   Read mapping", statstr,
                       "Getting read groups.")
        pimap.getRG(globs)

        if globs['mapper'] == "bwa":
            bamfiles, cmds = pimap.BWA(globs, cmds, cur_ref)
        # If --mapper is bwa
        if globs['mapper'] == "hisat2":
            bamfiles, cmds = pimap.hisat2(globs, cmds, cur_ref)
        # If --mapper is hisat2
        # READ MAPPING

        #rg_bamfile, cmds = varprep.addRG(globs, cmds, bamfiles);
        # ADD READ GROUPS

        merged_bamfile, cmds = pimap.mergeBam(globs, cmds, bamfiles)
        # MERGE BAM FILES also sorts

        if globs['mkdups']:
            cmds = pimap.markDups(globs, cmds, merged_bamfile)
        # MARK DUPLICATES

    elif globs['bam'] and globs['iteration'] == 1:
        PC.report_step(
            globs, cmds, "NA--01   Read mapping", "BAM",
            "initial BAM file provided, skipping all mapping steps: " +
            globs['iter-final-bam'])
    else:
        PC.report_step(
            globs, cmds, "NA--01   Read mapping", "RESUME",
            "previous processed BAM file found, skipping all mapping steps: " +
            globs['iter-final-bam'])
    ## READ MAPPING STEPS

    #cmds = varprep.indexBAM(globs, cmds);
    # INDEX BAM FILE
    # Now done during MarkDuplicates

    if globs['map-only']:
        PC.report_step(
            globs, cmds, "NA--04   Iteration cleanup", statstr,
            "Removing intermediate files based on --keep* options.")
        cmds = cleanUp(globs, cmds)

        PC.printWrite(
            globs['logfilename'], globs['log-v'], "#\n# " + "=" * 51 +
            " ITERATION " + globs['iter-str'] + " COMPLETE! " + "=" * 50)
        PC.report_step(globs, "", end=True)

        globs['iteration'] += 1
        return globs
    # This stops the program after the first iteration of mapping if --maponly is set.

    if do_varcalling:
        PC.report_step(globs, cmds, "NA--02   Variant calling", statstr,
                       "Calling and post-processing variants.")
        cmds = varcall.haplotypeCaller(globs, cmds, cur_ref,
                                       globs['iter-final-bam'])
        # HAPLOTYPECALLER

        if globs['last-iter']:
            cmds = varcall.genotypeGVCFs(globs, cmds, cur_ref)
        # GENOTYPE GVCFS FOR LAST ITER

        # if not globs['last-iter'] or (globs['last-iter'] and not globs['indels']):
        #     cmds = varpost.selectSNPs(globs, cmds);
        # SELECT SNPs if it is not the last iteration, or if it is and the final output should not contain indels

        cmds = varpost.varFilter(globs, cmds, cur_ref)
        # FILTER VCFs

        if globs['in-vcf']:
            varpost.varFilterManual(globs, cmds)
        # If a vcf file has been provided for filtering SNPs with -vcf, do that filtering here.

        cmds = varpost.gatherVCFs(globs, cmds)
        # COMBINE VCF

        cmds = varpost.indexVCF(globs, cmds, globs['iter-gather-vcf'])
        # INDEX VCF
    else:
        PC.report_step(
            globs, cmds, "NA--02   Variant calling", "RESUME",
            "previous processed VCF file found, skipping all variant calling steps: "
            + globs['iter-final-vcf'])
    ## VARIANT CALLING STEPS

    if do_consensus:
        PC.report_step(globs, cmds, "NA--03   Consensus generation", statstr,
                       "Generating consensus FASTA.")
        if globs['last-iter']:
            mask_bedfile, cmds = con.getMask(globs, cmds,
                                             globs['iter-gather-vcf'])
            # GET MASK SITES

            cur_ref, cmds = con.maskFa(globs, cmds, mask_bedfile, cur_ref)
            # MASK PREVIOUS REFERENCE

        if not globs['last-iter'] or (globs['last-iter']
                                      and not globs['indels']):
            cmds = varpost.selectSNPs(globs, cmds, globs['iter-gather-vcf'])
            # SELECT SNPs FROM VCF IF IT IS NOT THE LAST ITERATION OR IF --noindels IS SET

            cmds = varpost.indexVCF(globs, cmds, globs['iter-final-vcf'])
            # INDEX FINAL VCF

        cmds, globs = con.genConsensus(globs, cmds, globs['iter-final-vcf'],
                                       cur_ref)
        # GENERATE CONSENSUS
    else:
        PC.report_step(
            globs, cmds, "NA--03   Consensus generation", "RESUME",
            "previous processed consensus FASTA file found, skipping all consensus generation steps: "
            + globs['iter-final-fa'])
        globs['consensus-file'] = globs['iter-final-fa']
    ## CONSENSUS STEPS

    PC.report_step(globs, cmds, "NA--04   Iteration cleanup", statstr,
                   "Removing intermediate files based on --keep* options.")
    cmds = cleanUp(globs, cmds)

    PC.printWrite(
        globs['logfilename'], globs['log-v'], "#\n# " + "=" * 51 +
        " ITERATION " + globs['iter-str'] + " COMPLETE! " + "=" * 50)
    PC.report_step(globs, "", end=True)

    globs['iteration'] += 1
    return globs
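
A hypothetical driver sketch (not part of the examples above) showing how mapping() could be called iteratively, given that it increments globs['iteration'] and detects the last iteration against globs['num-iters']:

while globs['iteration'] <= globs['num-iters']:
    globs = mapping(globs)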
Example #9
def cleanUp(globs, cmds):
    i = globs['iter-str']
    prev_i = str(int(i) - 1)
    if len(prev_i) == 1:
        prev_i = "0" + prev_i

    possible_map_files = {
        'iter-' + i + "-dupmets.txt": 2,
        "merged-iter-" + i + ".bam.gz": 2,
        "merged-rg-iter-" + i + ".bam.gz": 2,
        "merged-rg-mkdup-iter-" + i + ".bam.gz": 1,
        "merged-rg-mkdup-iter-" + i + ".bam.gz.bai": 1,
        "pe-iter-" + i + ".bam.gz": 2,
        "pem-iter-" + i + ".bam.gz": 2,
        "se-iter-" + i + ".bam.gz": 2
    }

    possible_vcf_files = {
        "vcf-scaff": 2,
        "gvcf-scaff": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz.tbi": 2,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz": 1,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz.tbi": 1,
        "iter-" + i + "-gathervcfs-params.txt": 2,
        "iter-" + i + "-filter.vcf.gz": 1,
        "iter-" + i + "-filter.vcf.gz.tbi": 1,
        "iter-" + i + "-filter-snps.vcf.gz": 1,
        "iter-" + i + "-filter-snps.vcf.gz.tbi": 1
    }

    possible_fa_files = [
        "iter-" + prev_i + "-masked.fa", "iter-" + prev_i + "snps-masked.fa",
        "iter-" + i + "-snps-intermediate.dict",
        "iter-" + i + "-snps-intermediate.fa",
        "iter-" + i + "-snps-intermediate.fa.amb",
        "iter-" + i + "-snps-intermediate.fa.ann",
        "iter-" + i + "-snps-intermediate.fa.bwt",
        "iter-" + i + "-snps-intermediate.fa.fai",
        "iter-" + i + "-snps-intermediate.fa.pac",
        "iter-" + i + "-snps-intermediate.fa.sa"
    ]

    if globs['last-iter'] and globs['keeplevel'] == 0:
        globs['keeplevel'] = 1

    for f in possible_map_files:
        if possible_map_files[f] > globs['keeplevel']:
            full_f = os.path.join(globs['iterbamdir'], f)
            if os.path.isfile(full_f):
                cmd = "os.remove(" + full_f + ")"
                cmds[cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Removing file",
                    'outfile': "",
                    'logfile': "",
                    'start': False
                }
                if globs['dryrun']:
                    PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
                else:
                    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
                    os.remove(full_f)

    for f in possible_vcf_files:
        if possible_vcf_files[f] > globs['keeplevel']:
            full_f = os.path.join(globs['itervcfdir'], f)

            if os.path.isfile(full_f):
                cmd = "os.remove(" + full_f + ")"
                cmds[cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Removing file",
                    'outfile': "",
                    'logfile': "",
                    'start': False
                }
                if globs['dryrun']:
                    PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
                else:
                    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
                    os.remove(full_f)
            elif os.path.isdir(full_f):
                cmd = "shutil.rmtree(" + full_f + ")"
                cmds[cmd] = {
                    'cmd-num': PC.getCMDNum(globs, len(cmds)),
                    'desc': "Removing directory",
                    'outfile': "",
                    'logfile': "",
                    'start': False
                }
                if globs['dryrun']:
                    PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
                else:
                    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
                    shutil.rmtree(full_f)

    return cmds
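
A minimal illustration of the keep-level rule used above: a file tagged with level N is removed only when N is greater than the configured keep level (raised from 0 to 1 on the last iteration, as above):

# Hypothetical subset of the file tags defined above.
possible_files = {"merged-iter-01.bam.gz": 2, "merged-rg-mkdup-iter-01.bam.gz": 1}
keeplevel = 1
to_remove = [f for f, level in possible_files.items() if level > keeplevel]
print(to_remove)  # ['merged-iter-01.bam.gz']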
Example #10
def haplotypeCaller(globs, cmds, cur_ref, dup_bamfile):
    # Run HaplotypeCaller for each scaffold.

    gatk_cmds = {}
    for scaff in globs['scaffolds']:
        cur_logfile = os.path.join(
            globs['itervcflogdir'], "gatk-haplotypcaller-" + scaff + "-iter-" +
            globs['iter-str'] + ".log")
        if globs['last-iter']:
            vcffile = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + ".gvcf.gz")
        else:
            vcffile = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + ".vcf.gz")

        gatk_cmd = globs[
            'gatk-path'] + " HaplotypeCaller -R " + cur_ref + " -I " + dup_bamfile + " -L \"" + scaff + "\" -stand-call-conf 30 --native-pair-hmm-threads " + str(
                globs['gatk-t'])
        if globs['last-iter']:
            gatk_cmd += " -ERC GVCF"
        # The final iteration outputs GVCFs to properly emit all sites
        gatk_cmd += " -O " + vcffile

        cmd_num = PC.getCMDNum(globs, len(cmds))
        cmds[gatk_cmd] = {
            'cmd-num': cmd_num,
            'desc': "HaplotypeCaller " + scaff,
            'outfile': vcffile,
            'logfile': cur_logfile,
            'start': False
        }
        gatk_cmds[gatk_cmd] = {
            'cmd-num': cmd_num,
            'desc': "HaplotypeCaller " + scaff,
            'outfile': vcffile,
            'logfile': cur_logfile,
            'start': False
        }

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        gatk_skeleton_cmd = globs[
            'gatk-path'] + " HaplotypeCaller -R <reference fasta> -I <BAM file> -L \"<scaffold>\" -stand-call-conf 30 --native-pair-hmm-threads " + str(
                globs['gatk-t'])
        if globs['last-iter']:
            gatk_skeleton_cmd += " -ERC GVCF"
        # The final iteration outputs GVCFs to properly emit all sites
        gatk_skeleton_cmd += " -O <vcf file>"
        cmds[gatk_skeleton_cmd] = {
            'cmd-num': cmd_num,
            'desc':
            str(globs['gatk-procs']) + " HaplotypeCaller procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, gatk_skeleton_cmd, "DRYRUN",
                       gatk_skeleton_cmd)

    else:
        pool = mp.Pool(processes=globs['gatk-procs'])
        for exit_flag in pool.starmap(PC.runCMD, ((gatk_cmd, globs, cmds, True)
                                                  for gatk_cmd in gatk_cmds)):
            if exit_flag:
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
        pool.terminate()

    return cmds
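
For illustration, on the final iteration one per-scaffold command assembled above would look roughly like this (hypothetical paths, scaffold ID, and thread count):

example_cmd = (
    'gatk HaplotypeCaller -R iter-02-snps-intermediate.fa -I merged-rg-mkdup-iter-03.bam.gz '
    '-L "scaffold_1" -stand-call-conf 30 --native-pair-hmm-threads 4 '
    '-ERC GVCF '    # GVCF output so all sites are emitted on the last iteration
    '-O scaffold_1-iter-03.gvcf.gz'
)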