Beispiel #1
0
def getMask(globs, cmds, vcf_file):
    """Collect the no-call (``./.``) sites of a VCF into a merged BED file.

    The BED file is produced by a shell pipeline (zgrep | awk | bedtools
    merge). The pipeline is registered in ``cmds`` and executed with
    ``os.system`` unless this is a dry run, or a resumed run whose output
    file already exists and is non-empty.

    Args:
        globs: Global options dict. Reads 'iterfadir', 'iter-str', 'diploid',
            'resume' and 'dryrun'; 'exit-code' is set to 1 on failure.
        cmds: Dict of pipeline commands keyed by command string; the mask
            command is added to it.
        vcf_file: Path of the VCF that zgrep searches for ``./.`` genotypes.

    Returns:
        A ``(mask_bedfile, cmds)`` tuple: the path of the BED file and the
        updated command dict.
    """
    mask_bedfile = os.path.join(globs['iterfadir'],
                                "iter-" + globs['iter-str'] + "-masksites.bed")
    if globs['diploid']:
        mask_bedfile = mask_bedfile.replace("-masksites.bed",
                                            "-diploid-masksites.bed")

    # Raw strings keep the backslashes that grep and awk need without relying
    # on Python passing unknown escape sequences through. In the awk program
    # there is deliberately no ';' after the if-condition, so that the
    # condition guards the print and VCF header lines are skipped.
    cmd = (r'zgrep "\./\." ' + vcf_file +
           r""" | awk '{OFS="\t"; if ($0 !~ /^#/) print $1, $2-1, $2}'""" +
           " | bedtools merge -i - > " + mask_bedfile)
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get mask sites",
        'outfile': mask_bedfile,
        'logfile': "",
        'start': False
    }

    def has_output():
        # True when the BED file exists and is not empty.
        return (os.path.isfile(mask_bedfile)
                and os.stat(mask_bedfile).st_size != 0)

    if globs['resume'] and has_output():
        PC.report_step(globs, cmds, cmd, "RESUME",
                       "previous output found: " + mask_bedfile)
        return mask_bedfile, cmds
    # Re-use the output of a previous run instead of recomputing it.

    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        return mask_bedfile, cmds
    # A dry run only reports the command.

    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    os.system(cmd)

    if has_output():
        with open(mask_bedfile, "r") as bed:
            num_sites = str(sum(1 for _ in bed))
        # Count the merged intervals without leaving the file handle open.
        PC.report_step(globs, cmds, cmd, "SUCCESS",
                       num_sites + " mask sites read: " + mask_bedfile)
    else:
        PC.report_step(globs, cmds, cmd, "ERROR!",
                       "Mask sites file not found or empty: " + mask_bedfile)
        globs['exit-code'] = 1
        PC.endProg(globs)
        # NOTE(review): PC.endProg presumably terminates the program -- confirm.

    return mask_bedfile, cmds
Beispiel #2
0
def indexFa(globs, cmds, cur_ref):
    """Build the index files for a reference FASTA, running the tools in parallel.

    Up to three commands are registered and run: picard
    CreateSequenceDictionary (.dict), samtools faidx (.fai) and the index
    build of the current mapper (``bwa index`` or ``hisat2-build``). For the
    first iteration these files are assumed to exist before the program runs.

    Args:
        globs: Global options dict. Reads 'iterlogdir', 'iter-str',
            'overwrite', 'picard-path', 'samtools-path', 'mapper',
            'mapper-path' and 'num-procs'; 'exit-code' is set to 1 when a
            command reports failure.
        cmds: Dict of pipeline commands keyed by command string; the index
            commands are added to it.
        cur_ref: Path of the reference FASTA to index.

    Returns:
        The updated ``cmds`` dict.
    """
    index_cmds = {}
    # The commands created here; only these are handed to the pool below.

    def register(cmd, desc, outfile, log_prefix):
        # Record cmd, with one shared command number, in both cmds and index_cmds.
        entry = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': desc,
            'outfile': outfile,
            'logfile': os.path.join(
                globs['iterlogdir'],
                log_prefix + "-iter-" + globs['iter-str'] + ".log"),
            'start': False
        }
        cmds[cmd] = entry
        index_cmds[cmd] = dict(entry)

    ref_ext = PC.detectRefExt(cur_ref, globs)
    # The extension of the reference file as reported by PC.detectRefExt.

    dict_file = cur_ref.replace(ref_ext, ".dict")
    if os.path.isfile(dict_file) and globs['overwrite']:
        os.remove(dict_file)
    # NOTE(review): the old .dict is removed first, presumably because picard
    # will not overwrite an existing dictionary -- confirm.
    register(
        globs['picard-path'] + " CreateSequenceDictionary R=" + cur_ref +
        " O=" + dict_file, "Create reference dict", dict_file, "picard-dict")
    # Create the reference dictionary by running picard CreateSequenceDictionary

    register(globs['samtools-path'] + " faidx " + cur_ref,
             "Create reference faidx", cur_ref + ".fai", "samtools-faidx")
    # Create the reference index by running samtools faidx

    if globs['mapper'] == "bwa":
        # 'mapper-path' is the key every other mapper invocation in this file uses.
        register(globs['mapper-path'] + " index " + cur_ref,
                 "Create BWA reference index", "", "bwa-index")
    # Create the reference index by running bwa index if --mapper is bwa
    elif globs['mapper'] == "hisat2":
        register(globs['mapper-path'] + "-build " + cur_ref + " " + cur_ref,
                 "Create hisat2 reference index", "", "hisat2-build-index")
    # Create the reference index by running hisat2-build if --mapper is hisat2

    pool = mp.Pool(processes=min(3, globs['num-procs']))
    try:
        for result in pool.starmap(
                PC.runCMD,
                [(index_cmd, globs, cmds, True) for index_cmd in index_cmds]):
            if result:
                # A truthy result is treated as a failed command.
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
    finally:
        pool.terminate()
        # Always release the worker processes, even if an exception is raised.
    # Run the index commands in parallel and check for errors.

    return cmds


#############################################################################
Beispiel #3
0
def BWA(globs, cmds, cur_ref):
    """Map every read library in globs['libs'] to cur_ref with BWA mem.

    One ``bwa mem | samtools sort | samtools view`` pipeline is built per
    library, recorded in ``cmds`` and then run through PC.runCMD in a
    multiprocessing pool of globs['map-procs'] workers.

    Returns a ``(bamfiles, cmds)`` tuple: the output BAM paths (one per
    library, in iteration order) and the updated command dict.
    """
    pending = {}
    bamfiles = []

    for lib_type in globs['libs']:
        # File names for this library's log and BAM output.
        name_stub = lib_type + "-iter-" + globs['iter-str']
        logfile = os.path.join(globs['iterlogdir'],
                               "bwa-mem-" + name_stub + ".log")
        bamfile = os.path.join(globs['iterbamdir'], name_stub + ".bam.gz")
        bamfiles.append(bamfile)

        # Read group header for BWA's -R option; fields are separated by a
        # literal backslash-t, as in the command string built below.
        read_group = "\\t".join(
            ["@RG"] +
            [tag + ":" + globs['rg'][tag]
             for tag in ("ID", "PL", "PU", "LB", "SM")])

        # bwa mem output is piped to samtools for sorting and BAM conversion.
        mapping = " ".join([
            globs['mapper-path'], "mem", "-t", str(globs['map-t']), "-M",
            "-R", "'" + read_group + "'", cur_ref, globs['libs'][lib_type]
        ])
        stages = [
            mapping,
            globs['samtools-path'] + " sort",
            globs['samtools-path'] + " view -bh -",
        ]
        full_cmd = " | ".join(stages) + " > " + bamfile

        # The global record carries the output file; the local one does not.
        record = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "BWA " + lib_type + " read mapping",
            'outfile': bamfile,
            'logfile': logfile,
            'start': False
        }
        cmds[full_cmd] = record
        pending[full_cmd] = {
            key: value for key, value in record.items() if key != 'outfile'
        }

    # Run the mapping commands in parallel; a truthy result from PC.runCMD
    # is treated as an error and ends the program.
    pool = mp.Pool(processes=globs['map-procs'])
    job_args = [(queued, globs, cmds, True) for queued in pending]
    for failed in pool.starmap(PC.runCMD, job_args):
        if failed:
            pool.terminate()
            globs['exit-code'] = 1
            PC.endProg(globs)
    pool.terminate()

    return bamfiles, cmds
Beispiel #4
0
def hisat2(globs, cmds, cur_ref):
    """Map every read library in globs['libs'] to cur_ref with hisat2.

    One ``hisat2 | samtools sort | samtools view`` pipeline is built per
    library, recorded in ``cmds`` and then run through PC.runCMD in a
    multiprocessing pool of globs['map-procs'] workers.

    Args:
        globs: Global options dict. Reads 'libs', 'iterlogdir', 'iterbamdir',
            'iter-str', 'rg', 'mapper-path', 'map-t', 'samtools-path' and
            'map-procs'; 'exit-code' is set to 1 when a command reports
            failure.
        cmds: Dict of pipeline commands keyed by command string; the mapping
            commands are added to it.
        cur_ref: Index base name passed to hisat2's -x option.

    Returns:
        A ``(bamfiles, cmds)`` tuple: the output BAM paths (one per library)
        and the updated command dict.
    """
    hisat2_cmds, bamfiles = {}, []
    for lib_type in globs['libs']:
        # Generate a hisat2 command for each input fastq type.

        cur_logfile = os.path.join(
            globs['iterlogdir'],
            "hisat2-" + lib_type + "-iter-" + globs['iter-str'] + ".log")
        bamfile = os.path.join(
            globs['iterbamdir'],
            lib_type + "-iter-" + globs['iter-str'] + ".bam.gz")
        bamfiles.append(bamfile)
        # Get the bam file and log file for the current fastq file.

        hisat2_cmd = globs['mapper-path']
        hisat2_cmd += " --rg-id " + globs['rg']["ID"]
        # hisat2 only writes the @RG header line when --rg-id is given, so the
        # ID must go through --rg-id rather than --rg.
        for field in ["PL", "PU", "LB", "SM"]:
            hisat2_cmd += " --rg " + field + ":" + globs['rg'][field]
        # The remaining read group fields are added with --rg TAG:VALUE.
        hisat2_cmd += " -p " + str(globs['map-t'])
        hisat2_cmd += " -x " + cur_ref
        if lib_type == 'pe':
            mates = globs['libs'][lib_type].split(" ")
            # Paired-end input is stored as "<mate 1> <mate 2>".
            hisat2_cmd += " -1 " + mates[0]
            hisat2_cmd += " -2 " + mates[1]
        else:
            hisat2_cmd += " -U " + globs['libs'][lib_type]
        hisat2_cmd += " | " + globs['samtools-path'] + " sort"
        hisat2_cmd += " | " + globs['samtools-path'] + " view -bh -"
        hisat2_cmd += " > " + bamfile
        # Generate the hisat2 command, including passing output to samtools
        # for sorting and converting to .bam.

        cmd_num = PC.getCMDNum(globs, len(cmds))
        # Get the current command number for the log.

        desc = "hisat2 " + lib_type + " read mapping"
        cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': desc,
            'outfile': bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat2 command to the global cmds dict.

        hisat2_cmds[hisat2_cmd] = {
            'cmd-num': cmd_num,
            'desc': desc,
            'logfile': cur_logfile,
            'start': False
        }
        # Save the hisat2 command to the hisat2_cmds dict.
    # Prepare the hisat2 commands for each fastq type

    pool = mp.Pool(processes=globs['map-procs'])
    try:
        for result in pool.starmap(
                PC.runCMD,
                [(hisat2_cmd, globs, cmds, True)
                 for hisat2_cmd in hisat2_cmds]):
            if result:
                # A truthy result is treated as a failed command.
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
    finally:
        pool.terminate()
        # Always release the worker processes, even if an exception is raised.
    # Run the hisat2 commands across multiple processors, if specified
    # End the program if an error is encountered

    return bamfiles, cmds
Beispiel #5
0
def varFilter(globs, cmds, cur_ref):
    """Filter each scaffold VCF of the current iteration with ``bcftools filter``.

    One command per scaffold in globs['scaffolds'] is recorded in ``cmds``.
    In a dry run only a skeleton command is reported; otherwise the
    per-scaffold commands are run through PC.runCMD in a multiprocessing
    pool of globs['filter-procs'] workers.

    Args:
        globs: Global options dict. Reads 'scaffolds', 'itervcflogdir',
            'itervcfscaffdir', 'iter-str', 'bcftools-path', 'filter',
            'dryrun', 'num-procs' and 'filter-procs'; 'exit-code' is set to 1
            when a command reports failure.
        cmds: Dict of pipeline commands keyed by command string; the filter
            commands are added to it.
        cur_ref: Unused; kept so the signature matches the other steps.

    Returns:
        The updated ``cmds`` dict.
    """
    filter_opts = " filter -m+ -e " + globs[
        'filter'] + " -s pseudoit --IndelGap 5 -Oz -o "
    # NOTE(review): the filter expression is inserted as-is, so it presumably
    # already carries its own shell quoting -- confirm against the caller.

    bcftools_cmds = {}
    for scaff in globs['scaffolds']:
        base = scaff + "-iter-" + globs['iter-str']
        cur_logfile = os.path.join(globs['itervcflogdir'],
                                   "bcftools-filter-" + base + ".log")
        vcf_file = os.path.join(globs['itervcfscaffdir'], base + ".vcf.gz")
        filter_file = os.path.join(globs['itervcfscaffdir'],
                                   base + "-filter.vcf.gz")

        bcftools_cmd = globs[
            'bcftools-path'] + filter_opts + filter_file + " " + vcf_file

        entry = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Filter VCF " + scaff,
            'outfile': filter_file,
            'logfile': cur_logfile,
            'start': False,
            "vcffile": vcf_file
        }
        cmds[bcftools_cmd] = entry
        bcftools_cmds[bcftools_cmd] = dict(entry)
        # Record the command globally and in the local dict that feeds the pool.

    if globs['dryrun']:
        cmd_num = PC.getCMDNum(globs, len(cmds))
        bcftools_skeleton_cmd = globs[
            'bcftools-path'] + filter_opts + "<filtered vcf> <input vcf>"
        num_procs = globs.get('filter-procs', globs['num-procs'])
        # Report the size of the pool that would actually run the filters,
        # falling back to 'num-procs' if 'filter-procs' is not set.
        cmds[bcftools_skeleton_cmd] = {
            'cmd-num': cmd_num,
            'desc': str(num_procs) + " bcftools filter procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, bcftools_skeleton_cmd, "DRYRUN",
                       bcftools_skeleton_cmd)

    else:
        pool = mp.Pool(processes=globs['filter-procs'])
        try:
            for result in pool.starmap(
                    PC.runCMD,
                    [(bcftools_cmd, globs, cmds, True)
                     for bcftools_cmd in bcftools_cmds]):
                if result:
                    # A truthy result is treated as a failed command.
                    pool.terminate()
                    globs['exit-code'] = 1
                    PC.endProg(globs)
        finally:
            pool.terminate()
            # Always release the worker processes, even if an exception is raised.

    return cmds
Beispiel #6
0
def genotypeGVCFs(globs, cmds, cur_ref):
    """Genotype the per-scaffold GVCFs with GATK GenotypeGVCFs.

    One command per scaffold in globs['scaffolds'] is recorded in ``cmds``
    and run through PC.runCMD in a multiprocessing pool of
    globs['gvcf-procs'] workers.

    Args:
        globs: Global options dict. Reads 'scaffolds', 'itervcflogdir',
            'itervcfscaffdir', 'iter-str', 'gatk-path' and 'gvcf-procs';
            'exit-code' is set to 1 when a command reports failure.
        cmds: Dict of pipeline commands keyed by command string; the
            genotyping commands are added to it.
        cur_ref: Path of the reference FASTA passed to GATK's -R option.

    Returns:
        The updated ``cmds`` dict.
    """
    gatk_cmds = {}
    for scaff in globs['scaffolds']:
        base = scaff + "-iter-" + globs['iter-str']
        cur_logfile = os.path.join(globs['itervcflogdir'],
                                   "gatk-genotypegvcfs-" + base + ".log")
        # No space after the prefix: a space here would end up in the log path.
        gvcf_file = os.path.join(globs['itervcfscaffdir'], base + ".gvcf.gz")
        vcf_file = os.path.join(globs['itervcfscaffdir'], base + ".vcf.gz")

        gatk_cmd = globs[
            'gatk-path'] + " GenotypeGVCFs -R " + cur_ref + " -V " + gvcf_file + " -O " + vcf_file + " --include-non-variant-sites"

        entry = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Genotype gVCF " + scaff,
            'outfile': vcf_file,
            'logfile': cur_logfile,
            'start': False
        }
        cmds[gatk_cmd] = entry
        gatk_cmds[gatk_cmd] = dict(entry)
        # Record the command globally and in the local dict that feeds the pool.

    pool = mp.Pool(processes=globs['gvcf-procs'])
    try:
        for exit_flag in pool.starmap(
                PC.runCMD,
                [(gatk_cmd, globs, cmds, True) for gatk_cmd in gatk_cmds]):
            if exit_flag:
                # A truthy result is treated as a failed command.
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
    finally:
        pool.terminate()
        # Always release the worker processes, even if an exception is raised.

    return cmds


# #############################################################################

# def gatherVcfs(vcfdir, cur_ref, globs):
# # Combine the region VCFs from haplotypeCallerMulti.
#     cur_logfile = os.path.join(globs['logdir'], "gatk-gathervcfs-iter-" + globs['iter-str'] + ".log");
#     infile_ext = ".vcf.gz";
#     outfile_ext = ".vcf.gz";
#     if globs['iteration'] == globs['num-iters']:
#         infile_ext = "-filtered.vcf.gz";
#         outfile_ext = "-filtered-final.vcf.gz";
#         cur_logfile = cur_logfile.replace(".log", "-final.log");
#     vcf_file = os.path.join(globs['itervcfdir'], "iter-" + globs['iter-str'] + outfile_ext);

#     run_flag = PC.runCheck([vcffile], cur_logfile, globs);

#     if run_flag:
#         params_file = os.path.join(globs['iterdir'], "iter-" + globs['iter-str'] + "-gathervcfs-params.txt");
#         with open(params_file, "w") as paramsfile:
#             for scaff in globs['scaffolds']:
#                 scaff_vcf = os.path.join(vcfdir, scaff + "-iter-" + globs['iter-str'] + infile_ext);
#                 paramsfile.write("-I " + scaff_vcf + "\n");
#         gatk_cmd = globs['gatk-path'] + " GatherVcfs --arguments_file " + params_file + " -O " + vcffile;
#         exit_flag = PC.runCMD(gatk_cmd, "GATK GatherVcfs", cur_logfile, True, globs);
#     else:
#         PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF file already exists", globs['pad'], sep=".") + vcffile + "\n");
#         exit_flag = False;

#     return vcffile, exit_flag;

# #############################################################################

# def indexVCF(vcffile, globs, suffix=""):
# # Index the combined VCF from gatherVcfs.
#     if suffix != "":
#         suffix = "-" + suffix;
#     cur_logfile = os.path.join(globs['iterlogdir'], "vcf-index-iter-" + globs['iter-str'] + suffix + ".log");
#     if globs['iteration'] == globs['num-iters']:
#         cur_logfile = cur_logfile.replace(".log", "-final.log");
#     index_file = vcffile + ".tbi";

#     run_flag = PC.runCheck([index_file], cur_logfile, globs);

#     if run_flag:
#         index_cmd = "tabix -fp vcf " + vcffile;
#         exit_flag = PC.runCMD(index_cmd, "tabix", cur_logfile, True, globs);
#     else:
#         PC.printWrite(globs['logfilename'], globs['log-v'], PC.spacedOut("# VCF index file already exists", globs['pad'], sep=".") + vcffile + "\n");
#         exit_flag = False;

#     return exit_flag;

# #############################################################################
Beispiel #7
0
def haplotypeCaller(globs, cmds, cur_ref, dup_bamfile):
    """Call variants per scaffold with GATK HaplotypeCaller.

    A command is recorded in ``cmds`` for every scaffold in
    globs['scaffolds']. In a dry run a single skeleton command is reported;
    otherwise the per-scaffold commands are run through PC.runCMD in a pool
    of globs['gatk-procs'] workers. On the last iteration the output is a
    GVCF (-ERC GVCF), otherwise a plain VCF.

    Returns the updated ``cmds`` dict.
    """

    def build_cmd(ref, bam, region, out):
        # Assemble one HaplotypeCaller command line from its variable parts.
        line = globs['gatk-path'] + " HaplotypeCaller -R " + ref
        line += " -I " + bam
        line += " -L \"" + region + "\""
        line += " -stand-call-conf 30 --native-pair-hmm-threads " + str(
            globs['gatk-t'])
        if globs['last-iter']:
            # The final iteration outputs GVCFs to properly emit all sites
            line += " -ERC GVCF"
        return line + " -O " + out

    scaffold_cmds = {}
    for scaff in globs['scaffolds']:
        stem = scaff + "-iter-" + globs['iter-str']
        logfile = os.path.join(globs['itervcflogdir'],
                               "gatk-haplotypcaller-" + stem + ".log")
        suffix = ".gvcf.gz" if globs['last-iter'] else ".vcf.gz"
        outfile = os.path.join(globs['itervcfscaffdir'], stem + suffix)

        call_cmd = build_cmd(cur_ref, dup_bamfile, scaff, outfile)

        record = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "HaplotypeCaller " + scaff,
            'outfile': outfile,
            'logfile': logfile,
            'start': False
        }
        cmds[call_cmd] = record
        scaffold_cmds[call_cmd] = dict(record)

    if not globs['dryrun']:
        # Run the per-scaffold commands in parallel; a truthy result from
        # PC.runCMD is treated as an error and ends the program.
        pool = mp.Pool(processes=globs['gatk-procs'])
        job_args = [(queued, globs, cmds, True) for queued in scaffold_cmds]
        for failed in pool.starmap(PC.runCMD, job_args):
            if failed:
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
        pool.terminate()
        return cmds

    # Dry run: report one placeholder command instead of running anything.
    skeleton_num = PC.getCMDNum(globs, len(cmds))
    skeleton = build_cmd("<reference fasta>", "<BAM file>", "<scaffold>",
                         "<vcf file>")
    cmds[skeleton] = {
        'cmd-num': skeleton_num,
        'desc':
        str(globs['gatk-procs']) + " HaplotypeCaller procs in parallel",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    PC.report_step(globs, cmds, skeleton, "DRYRUN", skeleton)

    return cmds
Beispiel #8
0
if __name__ == '__main__':
    # Main is necessary for multiprocessing to work on Windows.

    globs = GV.init()
    # Initialise the global options dict; it holds the version info used below.

    if any(v in sys.argv for v in ["--version", "-version", "--v", "-v"]):
        print("# Pseudo-it version " + globs['version'] + " released on " +
              globs['releasedate'])
        sys.exit(0)
    # The version option to simply print the version and exit.

    print("#")
    print("# " + "=" * 125)
    print(PC.welcome())
    if "-h" not in sys.argv:
        print("       Pseudo assembly by iterative mapping.\n")
    # A welcome banner; the tagline is skipped when help was requested.

    globs = OP.optParse(globs)
    # Getting the input parameters from optParse.

    if globs['norun']:
        print("# --norun SET. EXITING AFTER PRINTING OPTIONS INFO...\n#")
        sys.exit(0)
    # With --norun only the options info is printed.

    globs = pseudoit(globs)
    PC.endProg(globs)
    # Run the pipeline, then hand the final globs to PC.endProg.

#############################################################################