def getMask(globs, cmds, vcf_file):
    """Build a BED file of sites to mask in the consensus for this iteration.

    Sites with uncalled genotypes ("./.") in vcf_file are converted to 0-based
    single-base BED intervals and merged with bedtools. Returns the BED file
    path and the updated cmds dict.
    """
    mask_bedfile = os.path.join(globs['iterfadir'], "iter-" + globs['iter-str'] + "-masksites.bed")
    if globs['diploid']:
        mask_bedfile = mask_bedfile.replace("-masksites.bed", "-diploid-masksites.bed")

    # zgrep pulls lines containing an uncalled genotype, awk skips header lines
    # and emits chrom/start/end, bedtools merge collapses adjacent intervals.
    # BUG FIX: the original awk had a stray ';' after the if condition (a null
    # statement), so header lines were never actually skipped, and leftover
    # '{{ }}' double braces from a str.format template.
    cmd = ("zgrep \"\\./\\.\" " + vcf_file +
           " | awk '{OFS=\"\\t\"; if ($0 !~ /#/) print $1, $2-1, $2}'" +
           " | bedtools merge -i - > " + mask_bedfile)
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get mask sites",
        'outfile': mask_bedfile,
        'logfile': "",
        'start': False
    }

    run = True
    if globs['resume']:
        # On --resume, skip the command if a non-empty output file already exists.
        if os.path.isfile(mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
            PC.report_step(globs, cmds, cmd, "RESUME", "previous output found: " + mask_bedfile)
            run = False

    if run:
        if not globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            if os.path.isfile(mask_bedfile) and os.stat(mask_bedfile).st_size != 0:
                # Count output intervals; use a context manager so the handle is
                # closed (the original leaked it via open(...).readlines()).
                with open(mask_bedfile, "r") as bedstream:
                    num_sites = str(len(bedstream.readlines()))
                PC.report_step(globs, cmds, cmd, "SUCCESS", num_sites + " mask sites read: " + mask_bedfile)
            else:
                PC.report_step(globs, cmds, cmd, "ERROR!", "Mask sites file not found or empty: " + mask_bedfile)
                globs['exit-code'] = 1
                PC.endProg(globs)
        else:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)

    return mask_bedfile, cmds
def varFilterManual(globs, cmds):
    """Flag SNPs from an input VCF (-vcf) in each per-scaffold iteration VCF.

    For every scaffold, unzips the iteration filter VCF, marks each SNP whose
    position and alternate allele match a site in globs['filter-sites'] by
    adding "pseudoit" to the FILTER column, rewrites the file, and re-zips it.
    This is O(sites x SNPs) and intended only for small genomes / small numbers
    of SNPs (specifically implemented for SARS-CoV-2 genomes).
    """
    for scaff in globs['scaffolds']:
        if globs['dryrun']:
            PC.report_step(globs, cmds, "NA Filtering variants from input VCF", "DRYRUN", globs['in-vcf'])
        else:
            PC.report_step(globs, cmds, "NA Filtering variants from input VCF", "EXECUTING", globs['in-vcf'])
            filter_file = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + "-filter.vcf.gz")
            filter_file_unzipped = filter_file.replace(".gz", "")

            # Unzip and read the iteration VCF file (handle closed via 'with';
            # the original leaked the file object).
            os.system("gunzip " + filter_file)
            with open(filter_file_unzipped) as vcf_stream:
                vcflines = [line.strip().split("\t") for line in vcf_stream]

            num_filtered = 0
            for i in range(len(vcflines)):
                # Skip header lines; check each SNP against the input sites.
                if vcflines[i][0].startswith("#"):
                    continue
                for snp in globs['filter-sites']:
                    # Match on position (col 1) and alternate allele (col 4).
                    # NOTE(review): the scaffold/CHROM column is not compared —
                    # fine for single-scaffold genomes; verify for multi-scaffold use.
                    if vcflines[i][1] == snp[1] and vcflines[i][4] in snp[4]:
                        if vcflines[i][6] == "PASS":
                            vcflines[i][6] = "pseudoit"
                        elif "pseudoit" not in vcflines[i][6]:
                            # BUG FIX: FILTER codes are semicolon-separated per the
                            # VCF spec; the original concatenated with no separator.
                            vcflines[i][6] += ";pseudoit"
                        num_filtered += 1

            # Re-write and re-compress the iteration VCF file.
            with open(filter_file_unzipped, "w") as new_vcf:
                for line in vcflines:
                    new_vcf.write("\t".join(line) + "\n")
            os.system("bgzip " + filter_file_unzipped)

            PC.report_step(globs, cmds, "NA " + str(num_filtered) + " sites filtered", "SUCCESS", globs['in-vcf'])
def getScaffs(cur_fa, globs, cmds, report_status=True):
    """Collect scaffold/contig/chromosome IDs from a FASTA file.

    Stores the IDs (first whitespace-delimited token of each header) in
    globs['scaffolds'] and returns the updated cmds dict.
    """
    # grep the sequence headers and strip the leading ">". This could also be
    # done by reading the .fai index, but the grep keeps it self-contained.
    cmd = "grep \">\" " + cur_fa + " | sed 's/>//g'"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Get ref scaffold IDs",
        'outfile': "",
        'logfile': "",
        'start': False
    }

    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        globs['scaffolds'] = []
        return cmds

    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    grep_result = subprocess.run(cmd,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
    header_lines = [h for h in grep_result.stdout.decode().split("\n") if h]
    # Only the first space-delimited token of each header is the scaffold ID.
    globs['scaffolds'] = [header.split(" ")[0] for header in header_lines]
    PC.report_step(globs, cmds, cmd, "SUCCESS",
                   str(len(globs['scaffolds'])) + " scaffold IDs read")
    return cmds
def _refIndexCheck(globs, cmds, indexfiles, err_code, err_msg, success_msg):
    # Helper: record a pseudo-command for an index-existence check, report it,
    # and error out with the given code/message if any listed file is missing.
    cmd = "os.path.isfile(" + ",".join(indexfiles) + ")"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Checking ref indices",
        'outfile': "",
        'logfile': "",
        'start': False
    }
    PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
    if any(not os.path.isfile(f) for f in indexfiles):
        PC.errorOut(err_code, err_msg, globs)
    PC.report_step(globs, cmds, cmd, "SUCCESS", success_msg)

def indexCheck(cur_fa, globs, cmds):
    """Check that the user created the required reference index files.

    Verifies the Picard dictionary, the samtools faidx index, and the
    mapper-specific index files (bwa or hisat2) before the program runs.
    The four duplicated check blocks of the original are unified in
    _refIndexCheck; a stray extra "" argument on the first success report
    (inconsistent with every sibling call) has been dropped.
    """
    ref_ext = PC.detectRefExt(cur_fa, globs)
    dictfile = cur_fa.replace(ref_ext, ".dict")

    # Check for the reference dictionary file.
    _refIndexCheck(globs, cmds, [dictfile], "REF1",
                   "Reference dictionary not found. Please run: picard CreateSequenceDictionary R=<ref>.fa O=<ref>.dict",
                   "index file found")

    # Check for the reference faidx file.
    _refIndexCheck(globs, cmds, [cur_fa + ".fai"], "REF2",
                   "Reference index (samtools) not found. Please run: samtools faidx <ref>.fa",
                   "index file found")

    if globs['mapper'] == "bwa":
        # Check for the bwa index files if --mapper is bwa.
        indexfiles = [cur_fa + ext for ext in (".amb", ".ann", ".bwt", ".pac", ".sa")]
        _refIndexCheck(globs, cmds, indexfiles, "REF3",
                       "Reference index (bwa) not found. Please run: bwa index <ref>.fa",
                       "index files found")
    elif globs['mapper'] == "hisat2":
        # Check for the hisat2 index files if --mapper is hisat2.
        # NOTE(review): error code "REF3" is shared with the bwa branch (as in the
        # original); only one branch runs per invocation so codes never collide.
        _refIndexCheck(globs, cmds, [cur_fa + ".1.ht2"], "REF3",
                       "Reference index (hisat2) not found. Please run: hisat2-build <ref>.fa <ref>.fa",
                       "index file found")

    return cmds
def mergeBam(globs, cmds, bamfiles):
    """Merge the BAM files from different library types into one BAM.

    With multiple inputs, runs Picard MergeSamFiles; with a single input,
    simply moves it to the expected merged path. Returns the merged BAM
    path and the updated cmds dict.
    """
    cur_logfile = os.path.join(
        globs['iterlogdir'],
        "picard-merge-bam-iter-" + globs['iter-str'] + ".log")
    merged_bamfile = os.path.join(
        globs['iterbamdir'],
        "merged-iter-" + globs['iter-str'] + ".bam.gz")

    if len(bamfiles) > 1:
        # Picard is only needed when mapping produced multiple BAM files.
        cmd_parts = [globs['picard-path'] + " MergeSamFiles"]
        cmd_parts += ["I=" + bamfile for bamfile in bamfiles]
        if globs['tmpdir'] != "System default.":
            cmd_parts.append("TMP_DIR=\"" + globs['tmpdir'] + "\"")
        if not globs['mkdups']:
            # No MarkDuplicates step will follow, so create the index here.
            cmd_parts.append("CREATE_INDEX=true")
        cmd_parts.append("USE_THREADING=TRUE VALIDATION_STRINGENCY=LENIENT O=" + merged_bamfile)
        merge_cmd = " ".join(cmd_parts)
        cmds[merge_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Merge BAM files",
            'outfile': merged_bamfile,
            'logfile': cur_logfile,
            'start': False
        }
        # Run the command and end the program on error.
        PC.exitCheck(PC.runCMD(merge_cmd, globs, cmds, True), globs)
    else:
        # One BAM from mapping: no merge needed, just move it into place.
        merge_cmd = "mv " + bamfiles[0] + " " + merged_bamfile
        cmds[merge_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Rename BAM file",
            'outfile': merged_bamfile,
            'logfile': "",
            'start': False
        }
        if globs['dryrun']:
            PC.report_step(globs, cmds, merge_cmd, "DRYRUN")
        else:
            PC.report_step(globs, cmds, merge_cmd, "EXECUTING")
            os.system(merge_cmd)
            if not os.path.isfile(merged_bamfile):
                PC.report_step(globs, cmds, merge_cmd, "ERROR")
                PC.errorOut("PIMAP1", "Error renaming BAM file.", globs)
            else:
                PC.report_step(globs, cmds, merge_cmd, "SUCCESS")

    return merged_bamfile, cmds
def varFilter(globs, cmds, cur_ref):
    """Soft-filter variants per scaffold with bcftools filter.

    Default filter expression: "MQ < 30.0 || DP < 5 || DP > 60". Failing sites
    are labeled "pseudoit" in FILTER (-s); commands run in parallel unless
    this is a dry run. Returns the updated cmds dict.
    """
    filter_cmds = {}
    for scaff in globs['scaffolds']:
        scaff_log = os.path.join(
            globs['itervcflogdir'],
            "bcftools-filter-" + scaff + "-iter-" + globs['iter-str'] + ".log")
        scaff_vcf = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + ".vcf.gz")
        scaff_filter_vcf = os.path.join(
            globs['itervcfscaffdir'],
            scaff + "-iter-" + globs['iter-str'] + "-filter.vcf.gz")
        filter_cmd = (globs['bcftools-path'] + " filter -m+ -e " + globs['filter'] +
                      " -s pseudoit --IndelGap 5 -Oz -o " + scaff_filter_vcf + " " + scaff_vcf)
        cmd_info = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Filter VCF " + scaff,
            'outfile': scaff_filter_vcf,
            'logfile': scaff_log,
            'start': False,
            "vcffile": scaff_vcf
        }
        # Register in the global dict and in the local batch as separate dict
        # objects so bookkeeping on one entry does not alias the other.
        cmds[filter_cmd] = cmd_info
        filter_cmds[filter_cmd] = dict(cmd_info)

    if globs['dryrun']:
        # For dry runs, report a single skeleton command instead of every scaffold's.
        skeleton_cmd = (globs['bcftools-path'] + " filter -m+ -e " + globs['filter'] +
                        " -s pseudoit --IndelGap 5 -Oz -o <filtered vcf> <input vcf>")
        cmds[skeleton_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': str(globs['num-procs']) + " bcftools filter procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, skeleton_cmd, "DRYRUN", skeleton_cmd)
    else:
        # Run the per-scaffold commands in parallel; abort everything if any fails.
        pool = mp.Pool(processes=globs['filter-procs'])
        for exit_flag in pool.starmap(PC.runCMD,
                                      [(fc, globs, cmds, True) for fc in filter_cmds]):
            if exit_flag:
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
        pool.terminate()

    return cmds
def genConsensus(globs, cmds, vcf_file, cur_ref):
# Run the command to generate a consensus FASTA file from the reference and the variants.
# vcf_file: the filtered VCF to inject; cur_ref: the (possibly masked) reference FASTA.
# Returns (cmds, globs); sets globs['consensus-file'] to the new consensus FASTA path.

    # Pseudo-command string used only as a key/label for the case-detection step below.
    cmd = "getConsCase()"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Determining case of first base",
        'outfile': "",
        'logfile': "",
        'start': False
    }

    # Build the bcftools consensus command; sites whose FILTER is 'pseudoit' or
    # 'IndelGap' are excluded from the consensus via -e.
    bcftools_cmd = globs[
        'bcftools-path'] + " consensus -f " + cur_ref + " -o " + globs[
            'iter-final-fa']
    if globs['last-iter'] and globs['indels']:
        # Last iteration with indels: also write a liftover chain file (-c).
        bcftools_cmd += " -c " + globs['iter-final-chain']
    if globs['last-iter'] and globs['diploid']:
        # -I: emit IUPAC ambiguity codes for heterozygous sites.
        bcftools_cmd += " -I "
    bcftools_cmd += " -e \"FILTER='pseudoit' || FILTER='IndelGap'\" " + vcf_file
    cmds[bcftools_cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': "Generating consensus",
        'outfile': globs['iter-final-fa'],
        'logfile': globs['iter-consensus-log'],
        'start': False
    }

    run_flag = True
    if globs['resume']:
        run_flag = PC.runCheck(bcftools_cmd, cmds, globs)
    #### RUN RUNCHECK FIRST

    first_lower = False
    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        # Placeholder values so the sed commands below can still be composed/reported.
        first_lower, linestr_orig, linestr_repl = True, "a", "A"
    elif run_flag:
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        first_lower, linestr_orig, linestr_repl = getConsCase(cur_ref)
        PC.report_step(globs, cmds, cmd, "SUCCESS",
                       "First base: " + linestr_orig[0])
    # This first_lower stuff is a hack to deal with bcftools consensus using the case of the first base in the reference fasta to inject variants.
    # Possibly resolved: https://github.com/samtools/bcftools/issues/1150#issuecomment-582407490
    # Need to test and make sure it is in official release before I remove this hack.

    if first_lower:
        # Upper-case the first reference base (line 2 of the FASTA) before consensus.
        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + cur_ref
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Changing first ref base to upper case",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base converted to upper case")
    # Part of first_lower hack.

    exit_flag = PC.runCMD(bcftools_cmd, globs, cmds, True)
    # Consensus command
    PC.exitCheck(exit_flag, globs)
    # End the program if an error is encountered

    if first_lower:
        # Revert the first base of the reference to its original case.
        cmd = "sed -i '2 s/" + linestr_repl + "/" + linestr_orig + "/g' " + cur_ref
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Reverting case of first ref base",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base reverted to original case")

        # Re-detect the case from the newly written consensus file (skipped on
        # dryrun, where the placeholder a/A values from above are reused).
        if not globs['dryrun']:
            first_lower, linestr_orig, linestr_repl = getConsCase(
                globs['iter-final-fa'])
        # Revert the first base of the consensus FASTA as well.
        cmd = "sed -i '2 s/" + linestr_orig + "/" + linestr_repl + "/g' " + globs[
            'iter-final-fa']
        cmds[cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "Reverting case of first consensus base",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        if globs['dryrun']:
            PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
        elif run_flag:
            PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
            os.system(cmd)
            PC.report_step(globs, cmds, cmd, "SUCCESS",
                           "First base reverted to original case")
    # Part of first_lower hack.

    globs['consensus-file'] = globs['iter-final-fa']
    return cmds, globs
def mapping(globs):
# Driver for one full iteration of the pipeline: read mapping, variant
# calling, and consensus generation. Mutates and returns globs; increments
# globs['iteration'] on completion.

    globs = PC.getIterStr(globs)
    PC.printWrite(
        globs['logfilename'], globs['log-v'],
        "#\n# " + "=" * 51 + " ITERATION " + globs['iter-str'] +
        " STARTING! " + "=" * 50)
    globs['iterstarttime'] = PC.report_step(globs, "", start=True)
    if globs['iteration'] == 1:
        globs['progstarttime'] = globs['iterstarttime']

    cmds = {}
    cur_ref = PC.getRef(globs)
    globs['last-iter'] = False
    if globs['iteration'] == globs['num-iters']:
        globs['last-iter'] = True

    # Iteration prep: per-iteration output directories.
    globs['iterdir'] = os.path.join(globs['outdir'],
                                    "iter-" + globs['iter-str'])
    globs['iterbamdir'] = os.path.join(globs['iterdir'], "bam")
    globs['itervcfdir'] = os.path.join(globs['iterdir'], "vcf")
    # NOTE(review): duplicate assignment of 'itervcfdir' (harmless, same value).
    globs['itervcfdir'] = os.path.join(globs['iterdir'], "vcf")
    globs['iterfadir'] = os.path.join(globs['iterdir'], "fa")
    globs['iterlogdir'] = os.path.join(globs['iterdir'], "logs")
    for d in [
            globs['iterdir'], globs['iterbamdir'], globs['itervcfdir'],
            globs['iterfadir'], globs['iterlogdir']
    ]:
        if not os.path.isdir(d):
            # With --maponly, only the bam and log dirs are needed.
            if globs['map-only'] and d not in [
                    globs['iterbamdir'], globs['iterlogdir']
            ]:
                continue
            os.makedirs(d)

    # Per-scaffold VCF/log dirs: GVCF naming on the last iteration.
    if globs['last-iter']:
        globs['itervcfscaffdir'] = os.path.join(globs['itervcfdir'],
                                                "gvcf-scaff")
        globs['itervcflogdir'] = os.path.join(globs['itervcfdir'], "gvcf-logs")
    else:
        globs['itervcfscaffdir'] = os.path.join(globs['itervcfdir'],
                                                "vcf-scaff")
        globs['itervcflogdir'] = os.path.join(globs['itervcfdir'], "vcf-logs")
    if not globs['map-only']:
        for d in [globs['itervcfscaffdir'], globs['itervcflogdir']]:
            if not os.path.isdir(d):
                os.makedirs(d)
    # Make directories for current iteration

    # Final BAM file for this iteration: a user-provided BAM on iteration 1
    # short-circuits mapping entirely.
    if globs['bam'] and globs['iteration'] == 1:
        globs['iter-final-bam-log'] = "NA"
        globs['iter-final-bam'] = globs['bam']
    else:
        if globs['mkdups']:
            globs['iter-final-bam-log'] = os.path.join(
                globs['iterlogdir'],
                "picard-mkdup-iter-" + globs['iter-str'] + ".log")
            globs['iter-final-bam'] = os.path.join(
                globs['iterbamdir'],
                "merged-rg-mkdup-iter-" + globs['iter-str'] + ".bam.gz")
        else:
            # If --nomkdup is set, the final BAM file for each iteration should
            # not have the mkdup suffix.
            globs['iter-final-bam-log'] = os.path.join(
                globs['iterlogdir'],
                "picard-merge-bam-iter-" + globs['iter-str'] + ".log")
            globs['iter-final-bam'] = os.path.join(
                globs['iterbamdir'],
                "merged-iter-" + globs['iter-str'] + ".bam.gz")

    # Final VCF file for this iteration: last iteration gets final names,
    # earlier iterations get "-intermediate" names; with --noindels the final
    # VCF is the SNP-only selection.
    if globs['last-iter']:
        globs['iter-gather-vcf-log'] = os.path.join(
            globs['iterlogdir'],
            "gatk-gathervcfs-iter-" + globs['iter-str'] + ".log")
        globs['iter-gather-vcf'] = os.path.join(
            globs['itervcfdir'],
            "iter-" + globs['iter-str'] + "-filter.vcf.gz")
        if globs['indels']:
            globs['iter-final-vcf-log'] = globs['iter-gather-vcf-log']
            globs['iter-final-vcf'] = globs['iter-gather-vcf']
        else:
            globs['iter-final-vcf-log'] = os.path.join(
                globs['iterlogdir'],
                "gatk-selectsnps-iter-" + globs['iter-str'] + ".log")
            globs['iter-final-vcf'] = os.path.join(
                globs['itervcfdir'],
                "iter-" + globs['iter-str'] + "-filter-snps.vcf.gz")
    else:
        globs['iter-gather-vcf-log'] = os.path.join(
            globs['iterlogdir'],
            "gatk-gathervcfs-iter-" + globs['iter-str'] + "-intermediate.log")
        globs['iter-gather-vcf'] = os.path.join(
            globs['itervcfdir'],
            "iter-" + globs['iter-str'] + "-filter-intermediate.vcf.gz")
        globs['iter-final-vcf-log'] = os.path.join(
            globs['iterlogdir'],
            "gatk-selectsnps-iter-" + globs['iter-str'] + "-intermediate.log")
        globs['iter-final-vcf'] = os.path.join(
            globs['itervcfdir'],
            "iter-" + globs['iter-str'] + "-filter-intermediate-snps.vcf.gz")

    # Final FASTA files for this iteration (consensus log, chain, fasta);
    # diploid output gets "-diploid" inserted into the final names.
    if globs['last-iter']:
        if globs['indels']:
            globs['iter-consensus-log'] = os.path.join(
                globs['iterlogdir'],
                "bcftools-consensus-iter-" + globs['iter-str'] + "-final.log")
            globs['iter-final-chain'] = os.path.join(
                globs['iterfadir'],
                "iter-" + globs['iter-str'] + "-final.chain")
            globs['iter-final-fa'] = os.path.join(
                globs['iterfadir'], "iter-" + globs['iter-str'] + "-final.fa")
        else:
            globs['iter-consensus-log'] = os.path.join(
                globs['iterlogdir'], "bcftools-consensus-iter-" +
                globs['iter-str'] + "-snps-final.log")
            globs['iter-final-chain'] = os.path.join(
                globs['iterfadir'],
                "iter-" + globs['iter-str'] + "-snps-final.chain")
            globs['iter-final-fa'] = os.path.join(
                globs['iterfadir'],
                "iter-" + globs['iter-str'] + "-snps-final.fa")
        if globs['diploid']:
            globs['iter-consensus-log'] = globs['iter-consensus-log'].replace(
                "-final.log", "-diploid-final.log")
            globs['iter-final-chain'] = globs['iter-final-chain'].replace(
                "-final.chain", "-diploid-final.chain")
            globs['iter-final-fa'] = globs['iter-final-fa'].replace(
                "-final.fa", "-diploid-final.fa")
    else:
        globs['iter-consensus-log'] = os.path.join(
            globs['iterlogdir'], "bcftools-consensus-iter-" +
            globs['iter-str'] + "-snps-intermediate.log")
        globs['iter-final-chain'] = os.path.join(
            globs['iterfadir'],
            "iter-" + globs['iter-str'] + "-snps-intermediate.chain")
        globs['iter-final-fa'] = os.path.join(
            globs['iterfadir'],
            "iter-" + globs['iter-str'] + "-snps-intermediate.fa")
    # Output files for current iteration

    # Check that all index files have been created and get the scaffold IDs
    # from the reference FASTA (first iteration only).
    if globs['iteration'] == 1:
        cmds = piref.indexCheck(globs['ref'], globs, cmds)
        cmds = piref.getScaffs(globs['ref'], globs, cmds)

    # CHECK WHICH STEPS WE NEED TO PERFORM: on --resume, a step is skipped if
    # its final output and log check out; any re-run step forces all later steps.
    if globs['bam'] and globs['iteration'] == 1:
        do_mapping = False
    elif globs['resume']:
        do_mapping = PC.prevCheck(globs['iter-final-bam'],
                                  globs['iter-final-bam-log'], globs)
    else:
        do_mapping = True
    if do_mapping:
        do_varcalling = True
    elif globs['resume']:
        do_varcalling = PC.prevCheck(globs['iter-final-vcf'],
                                     globs['iter-final-vcf-log'], globs)
    else:
        do_varcalling = True
    if do_varcalling:
        do_consensus = True
    elif globs['resume']:
        do_consensus = PC.prevCheck(globs['iter-final-fa'],
                                    globs['iter-consensus-log'], globs)
    else:
        do_consensus = True

    # Status string for the main step reports.
    statstr = "EXECUTING"
    if globs['resume']:
        statstr = "RESUME"
    if globs['dryrun']:
        statstr = "DRYRUN"

    # INDEX FASTA IF NOT FIRST ITERATION; on the first iteration, optionally
    # read the input VCF of SNPs to ignore during variant calling.
    if globs['iteration'] != 1:
        cmds = piref.indexFa(globs, cmds, cur_ref)
    else:
        if globs['in-vcf']:
            PC.report_step(
                globs, cmds, "NA--00 Reading input VCF", statstr,
                "Reading input SNPs to ignore during variant calling.")
            if not globs['dryrun']:
                # If a VCF file has been provided to filter SNPs with -vcf, read
                # those SNPs here. This is slow, meant for a small number of SNPs
                # in small genomes only.
                globs['filter-sites'] = PC.readVCF(globs['in-vcf'])

    ## READ MAPPING STEPS
    if do_mapping:
        PC.report_step(globs, cmds, "NA--01 Read mapping", statstr,
                       "Mapping reads and post-processing.")
        PC.report_step(globs, cmds, "NA--01 Read mapping", statstr,
                       "Getting read groups.")
        pimap.getRG(globs)
        if globs['mapper'] == "bwa":
            # If --mapper is bwa
            bamfiles, cmds = pimap.BWA(globs, cmds, cur_ref)
        if globs['mapper'] == "hisat2":
            # If --mapper is hisat2
            bamfiles, cmds = pimap.hisat2(globs, cmds, cur_ref)
        #rg_bamfile, cmds = varprep.addRG(globs, cmds, bamfiles); # ADD READ GROUPS
        # MERGE BAM FILES (also sorts)
        merged_bamfile, cmds = pimap.mergeBam(globs, cmds, bamfiles)
        if globs['mkdups']:
            # MARK DUPLICATES
            cmds = pimap.markDups(globs, cmds, merged_bamfile)
    elif globs['bam'] and globs['iteration'] == 1:
        PC.report_step(
            globs, cmds, "NA--01 Read mapping", "BAM",
            "initial BAM file provided, skipping all mapping steps: " +
            globs['iter-final-bam'])
    else:
        PC.report_step(
            globs, cmds, "NA--01 Read mapping", "RESUME",
            "previous processed BAM file found, skipping all mapping steps: " +
            globs['iter-final-bam'])
    #cmds = varprep.indexBAM(globs, cmds); # INDEX BAM FILE # Now done during MarkDuplicates

    # This stops the program after the first iteration of mapping if --maponly is set.
    if globs['map-only']:
        PC.report_step(
            globs, cmds, "NA--04 Iteration cleanup", statstr,
            "Removing intermediate files based on --keep* options.")
        cmds = cleanUp(globs, cmds)
        PC.printWrite(
            globs['logfilename'], globs['log-v'],
            "#\n# " + "=" * 51 + " ITERATION " + globs['iter-str'] +
            " COMPLETE! " + "=" * 50)
        PC.report_step(globs, "", end=True)
        globs['iteration'] += 1
        return globs

    ## VARIANT CALLING STEPS
    if do_varcalling:
        PC.report_step(globs, cmds, "NA--02 Variant calling", statstr,
                       "Calling and post-processing variants.")
        # HAPLOTYPECALLER
        cmds = varcall.haplotypeCaller(globs, cmds, cur_ref,
                                       globs['iter-final-bam'])
        if globs['last-iter']:
            # GENOTYPE GVCFS FOR LAST ITER
            cmds = varcall.genotypeGVCFs(globs, cmds, cur_ref)
        # if not globs['last-iter'] or (globs['last-iter'] and not globs['indels']):
        #     cmds = varpost.selectSNPs(globs, cmds); # SELECT SNPs if it is not the last iteration, or if it is and the final output should not contain indels
        # FILTER VCFs
        cmds = varpost.varFilter(globs, cmds, cur_ref)
        if globs['in-vcf']:
            # If a vcf file has been provided for filtering SNPs with -vcf, do
            # that filtering here.
            varpost.varFilterManual(globs, cmds)
        # COMBINE VCF
        cmds = varpost.gatherVCFs(globs, cmds)
        # INDEX VCF
        cmds = varpost.indexVCF(globs, cmds, globs['iter-gather-vcf'])
    else:
        PC.report_step(
            globs, cmds, "NA--02 Variant calling", "RESUME",
            "previous processed VCF file found, skipping all variant calling steps: "
            + globs['iter-final-vcf'])

    ## CONSENSUS STEPS
    if do_consensus:
        PC.report_step(globs, cmds, "NA--03 Consensus generation", statstr,
                       "Generating consensus FASTA.")
        if globs['last-iter']:
            # GET MASK SITES and MASK PREVIOUS REFERENCE
            mask_bedfile, cmds = con.getMask(globs, cmds,
                                             globs['iter-gather-vcf'])
            cur_ref, cmds = con.maskFa(globs, cmds, mask_bedfile, cur_ref)
        if not globs['last-iter'] or (globs['last-iter']
                                      and not globs['indels']):
            # SELECT SNPs FROM VCF IF IT IS NOT THE LAST ITERATION OR IF --noindels IS SET
            cmds = varpost.selectSNPs(globs, cmds, globs['iter-gather-vcf'])
            # INDEX FINAL VCF
            cmds = varpost.indexVCF(globs, cmds, globs['iter-final-vcf'])
        # GENERATE CONSENSUS
        cmds, globs = con.genConsensus(globs, cmds, globs['iter-final-vcf'],
                                       cur_ref)
    else:
        PC.report_step(
            globs, cmds, "NA--03 Consensus generation", "RESUME",
            "previous processed consensus FASTA file found, skipping all consensus generation steps: "
            + globs['iter-final-fa'])
        globs['consensus-file'] = globs['iter-final-fa']

    # Iteration cleanup and wrap-up.
    PC.report_step(globs, cmds, "NA--04 Iteration cleanup", statstr,
                   "Removing intermediate files based on --keep* options.")
    cmds = cleanUp(globs, cmds)
    PC.printWrite(
        globs['logfilename'], globs['log-v'],
        "#\n# " + "=" * 51 + " ITERATION " + globs['iter-str'] +
        " COMPLETE! " + "=" * 50)
    PC.report_step(globs, "", end=True)
    globs['iteration'] += 1
    return globs
def _removeCleanupTarget(globs, cmds, full_f, is_dir):
    # Helper: record a pseudo-command for removing one intermediate file or
    # directory, then execute it (or just report it on a dry run).
    if is_dir:
        cmd = "shutil.rmtree(" + full_f + ")"
        desc = "Removing directory"
    else:
        cmd = "os.remove(" + full_f + ")"
        desc = "Removing file"
    cmds[cmd] = {
        'cmd-num': PC.getCMDNum(globs, len(cmds)),
        'desc': desc,
        'outfile': "",
        'logfile': "",
        'start': False
    }
    if globs['dryrun']:
        PC.report_step(globs, cmds, cmd, "DRYRUN", cmd)
    else:
        PC.report_step(globs, cmds, cmd, "EXECUTING", cmd)
        if is_dir:
            shutil.rmtree(full_f)
        else:
            os.remove(full_f)

def cleanUp(globs, cmds):
    """Remove intermediate files for the current iteration based on keeplevel.

    Each candidate file maps to the minimum --keep* level at which it is
    retained: entries with a level greater than globs['keeplevel'] are deleted.
    Returns the updated cmds dict.
    """
    i = globs['iter-str']
    prev_i = str(int(i) - 1)
    if len(prev_i) == 1:
        # BUG FIX: zero-pad the previous iteration string to match the iter-XX
        # file naming. The original assigned to a typo'd name ('prev_iter'),
        # so the padding never took effect.
        prev_i = "0" + prev_i

    possible_map_files = {
        'iter-' + i + "-dupmets.txt": 2,
        "merged-iter-" + i + ".bam.gz": 2,
        "merged-rg-iter-" + i + ".bam.gz": 2,
        "merged-rg-mkdup-iter-" + i + ".bam.gz": 1,
        "merged-rg-mkdup-iter-" + i + ".bam.gz.bai": 1,
        "pe-iter-" + i + ".bam.gz": 2,
        "pem-iter-" + i + ".bam.gz": 2,
        "se-iter-" + i + ".bam.gz": 2
    }
    possible_vcf_files = {
        "vcf-scaff": 2,
        "gvcf-scaff": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz": 2,
        "iter-" + i + "-filter-intermediate.vcf.gz.tbi": 2,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz": 1,
        "iter-" + i + "-filter-intermediate-snps.vcf.gz.tbi": 1,
        "iter-" + i + "-gathervcfs-params.txt": 2,
        "iter-" + i + "-filter.vcf.gz": 1,
        "iter-" + i + "-filter.vcf.gz.tbi": 1,
        "iter-" + i + "-filter-snps.vcf.gz": 1,
        "iter-" + i + "-filter-snps.vcf.gz.tbi": 1
    }
    # NOTE(review): this list is built but never iterated below, so the FASTA
    # intermediates are never removed (preserved from the original; confirm
    # whether a removal loop was intended). The second entry also looks like it
    # is missing a hyphen ("-snps-masked.fa") — verify against maskFa's naming.
    possible_fa_files = [
        "iter-" + prev_i + "-masked.fa", "iter-" + prev_i + "snps-masked.fa",
        "iter-" + i + "-snps-intermediate.dict",
        "iter-" + i + "-snps-intermediate.fa",
        "iter-" + i + "-snps-intermediate.fa.amb",
        "iter-" + i + "-snps-intermediate.fa.ann",
        "iter-" + i + "-snps-intermediate.fa.bwt",
        "iter-" + i + "-snps-intermediate.fa.fai",
        "iter-" + i + "-snps-intermediate.fa.pac",
        "iter-" + i + "-snps-intermediate.fa.sa"
    ]

    # On the last iteration, keep at least the level-1 final outputs.
    if globs['last-iter'] and globs['keeplevel'] == 0:
        globs['keeplevel'] = 1

    for f in possible_map_files:
        if possible_map_files[f] > globs['keeplevel']:
            full_f = os.path.join(globs['iterbamdir'], f)
            if os.path.isfile(full_f):
                _removeCleanupTarget(globs, cmds, full_f, False)

    for f in possible_vcf_files:
        if possible_vcf_files[f] > globs['keeplevel']:
            full_f = os.path.join(globs['itervcfdir'], f)
            if os.path.isfile(full_f):
                _removeCleanupTarget(globs, cmds, full_f, False)
            elif os.path.isdir(full_f):
                # The per-scaffold vcf/gvcf entries are directories.
                _removeCleanupTarget(globs, cmds, full_f, True)

    return cmds
def haplotypeCaller(globs, cmds, cur_ref, dup_bamfile):
    """Run GATK HaplotypeCaller on each scaffold of the current reference.

    Builds one command per scaffold (GVCF output on the last iteration so all
    sites are emitted) and runs them in parallel unless this is a dry run.
    Returns the updated cmds dict.
    """
    hc_cmds = {}
    for scaff in globs['scaffolds']:
        scaff_log = os.path.join(
            globs['itervcflogdir'],
            "gatk-haplotypcaller-" + scaff + "-iter-" + globs['iter-str'] + ".log")
        if globs['last-iter']:
            scaff_vcf = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + ".gvcf.gz")
        else:
            scaff_vcf = os.path.join(
                globs['itervcfscaffdir'],
                scaff + "-iter-" + globs['iter-str'] + ".vcf.gz")
        hc_cmd = (globs['gatk-path'] + " HaplotypeCaller -R " + cur_ref +
                  " -I " + dup_bamfile + " -L \"" + scaff +
                  "\" -stand-call-conf 30 --native-pair-hmm-threads " +
                  str(globs['gatk-t']))
        if globs['last-iter']:
            # The final iteration outputs GVCFs to properly emit all sites.
            hc_cmd += " -ERC GVCF"
        hc_cmd += " -O " + scaff_vcf
        cmd_info = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': "HaplotypeCaller " + scaff,
            'outfile': scaff_vcf,
            'logfile': scaff_log,
            'start': False
        }
        # Register in the global dict and in the local batch as separate dict
        # objects so bookkeeping on one entry does not alias the other.
        cmds[hc_cmd] = cmd_info
        hc_cmds[hc_cmd] = dict(cmd_info)

    if globs['dryrun']:
        # For dry runs, report a single skeleton command instead of every scaffold's.
        skeleton_cmd = (globs['gatk-path'] +
                        " HaplotypeCaller -R <reference fasta> -I <BAM file> -L \"<scaffold>\" -stand-call-conf 30 --native-pair-hmm-threads "
                        + str(globs['gatk-t']))
        if globs['last-iter']:
            skeleton_cmd += " -ERC GVCF"
        skeleton_cmd += " -O <vcf file>"
        cmds[skeleton_cmd] = {
            'cmd-num': PC.getCMDNum(globs, len(cmds)),
            'desc': str(globs['gatk-procs']) + " HaplotypeCaller procs in parallel",
            'outfile': "",
            'logfile': "",
            'start': False
        }
        PC.report_step(globs, cmds, skeleton_cmd, "DRYRUN", skeleton_cmd)
    else:
        # Run the per-scaffold commands in parallel; abort everything if any fails.
        pool = mp.Pool(processes=globs['gatk-procs'])
        for exit_flag in pool.starmap(PC.runCMD,
                                      [(hc, globs, cmds, True) for hc in hc_cmds]):
            if exit_flag:
                pool.terminate()
                globs['exit-code'] = 1
                PC.endProg(globs)
        pool.terminate()

    return cmds