def submit_qsub(qsub_opt, qsub_cmd, qsub_name, output_path, log):
    job_id = 0
    job_share = 150
    cluster_project = "gentech-rqc.p"

    qsub_path = os.path.join(output_path, "qsub")
    if not os.path.isdir(qsub_path):
        os.makedirs(qsub_path)

    output_log = os.path.join(qsub_path, "qsub-%s.log" % qsub_name)

    my_qsub = "qsub -b y -j y -m n -w e -terse -N %s -P %s -o %s -js %s %s '%s'" % (
        qsub_name, cluster_project, output_log, job_share, qsub_opt, qsub_cmd)

    cmd = "module load uge;%s" % my_qsub
    stdOut, stdErr, exitCode = run_sh_command(cmd, True, log)
    post_mortem_cmd(cmd, exitCode, stdOut, stdErr, log)

    if exitCode == 0:
        job_id = int(stdOut.strip())  ## -terse makes qsub print only the job id
        log.info("- cluster job id: %s", job_id)

    if job_id > 0:
        ## append to qsub_list.txt so watch_cluster() can poll this job later
        qsub_log = os.path.join(output_path, "qsub_list.txt")
        fh = open(qsub_log, "a")
        fh.write("%s,%s\n" % (job_id, "submitted"))
        fh.close()

    return job_id
def read_qc(odir, fastq, status):
    log_and_print(
        "\n\n%s - RUN QC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    qcdir = os.path.join(odir, READQC_DIR)

    if not os.path.isfile(fastq):
        ## keep the (ok, qcdir, status) return shape consistent;
        ## this used to return a 2-tuple (False, status)
        return False, qcdir, status

    if STEP_ORDER[status] < STEP_ORDER[QC_END]:
        checkpoint(QC_START, status)
        qcexe = os.path.join(os.path.dirname(__file__), 'readqc.py')
        cmd = '%s -o %s -f %s --skip-blast' % (qcexe, qcdir, fastq)
        # print('DEBUG : %s' % cmd)
        stdOut, stdErr, exitCode = run_sh_command(cmd, True)
        if exitCode != 0:
            print('ERROR : %s' % stdErr)
            return False, qcdir, status
        checkpoint(QC_END, status)
        status = QC_END
    else:
        log_and_print("No need to do qc step.")

    return True, qcdir, status
def localize_file(fileNameFullPath, log):
    done = 0
    loopCount = 0

    while done == 0:
        loopCount += 1
        if os.path.isfile(fileNameFullPath):
            done = 1  ## congratulations! The file system seems to be working for the moment
        else:
            sleepTime = 60 * loopCount
            log.error("Filename doesn't exist on the system: %s, sleeping for %s seconds.",
                      fileNameFullPath, sleepTime)
            time.sleep(sleepTime)
            if loopCount > 3:
                done = 1  ## probably failed

    if not os.path.isfile(fileNameFullPath):
        return None

    destinationPath = "/scratch/rqc/localized-file"
    if not os.path.isdir(destinationPath):
        # make_dir_p(destinationPath)
        _, _, exitCode = run_sh_command(
            "mkdir -p /scratch/rqc/localized-file && chmod 777 /scratch/rqc/localized-file",
            True, log, True)
        assert exitCode == 0

    fileName = safe_basename(fileNameFullPath, log)[0]
    localizeCmd = "rsync -av --omit-dir-times --no-perms " + fileNameFullPath + " " + \
        os.path.join(destinationPath, fileName)
    _, _, exitCode = run_sh_command(localizeCmd, True, log, True)

    localizedFileNameFullPath = None
    if exitCode == RQCExitCodes.JGI_SUCCESS:
        localizedFileNameFullPath = os.path.join(destinationPath, fileName)
        log.info("File localization completed for %s", fileName)
    else:
        log.error("File localization failed for %s", fileName)

    return localizedFileNameFullPath
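
## Usage sketch (illustrative only): localize_file() stages a file onto node-local scratch
## and returns the localized path, or None on failure. The input path and logger below are
## hypothetical stand-ins for the pipeline's configured values.
# import logging
# log = logging.getLogger("rqc")
# localFastq = localize_file("/global/dna/dm_archive/sdm/illumina/00/00/raw.fastq.gz", log)
# if localFastq is None:
#     log.error("Localization failed; falling back to the global file system path.")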
def watch_cluster(output_path, log, sleep_time=300):
    ## sleep_time: check isjobcomplete every xx seconds
    log.info("watch_cluster")

    ## isjobcomplete.new <job_id> = nice way to ask if a job is done
    is_job_complete = "/usr/common/usg/bin/isjobcomplete.new"
    qsub_log = os.path.join(output_path, "qsub_list.txt")

    ## total number of heartbeats before we give up: 180 hours worth of run time
    hb_max = (180 * 3600) / sleep_time
    #hb_max = 5 # temp

    done = 0
    hb_cnt = 0  ## heartbeat count

    if not os.path.isfile(qsub_log):
        done = 1

    while done == 0:
        hb_cnt += 1
        log.info("- heartbeat: %s", hb_cnt)

        qsub_list = []
        qsub_cnt = 0
        qsub_complete = 0
        qsub_err = 0

        fh = open(qsub_log, "r")
        for line in fh:
            qsub_list.append(line.strip())
            qsub_cnt += 1
        fh.close()

        fh = open(qsub_log, "w")
        for qsub in qsub_list:
            job_id, status = qsub.split(",")
            new_status = status

            if status in ("complete", "fail"):
                continue

            cmd = "%s %s" % (is_job_complete, job_id)
            stdOut, stdErr, exitCode = run_sh_command(cmd, True, log)
            #post_mortem_cmd(cmd, exitCode, stdOut, stdErr, log)

            running = "%s queued/running" % job_id
            not_running = "%s not queued/running" % job_id
            error_qw = "%s queued/running/error" % job_id

            if stdOut.strip() == running:  ## was stdOut.strip (missing call parentheses)
                new_status = "running"
            elif stdOut.strip() == not_running:
                new_status = "complete"  ## might have failed
                qsub_complete += 1
            elif stdOut.strip() == error_qw:
                new_status = "error"
                qsub_err += 1

            ## fallback in case the tool's output omits the job id
            if stdOut.strip() == "not queued/running":
                new_status = "complete"

            fh.write("%s,%s\n" % (job_id, new_status))
            log.info("- job_id: %s, status: %s", job_id, new_status)
        fh.close()

        qsub_running = qsub_cnt - (qsub_complete + qsub_err)
        log.info("- job count: %s, running: %s, err: %s", qsub_cnt, qsub_running, qsub_err)

        if qsub_cnt == (qsub_complete + qsub_err):
            done = 1
        if hb_cnt >= hb_max:
            done = 1
        if done == 0:
            time.sleep(sleep_time)
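
## Usage sketch (illustrative only): submit_qsub() and watch_cluster() form a
## submit-then-poll pair; each submission is appended to <output_path>/qsub_list.txt and
## watch_cluster() blocks until every recorded job leaves the queue or the heartbeat limit
## is reached. The qsub option string and command below are hypothetical.
# job_id = submit_qsub("-l h_rt=04:00:00", "readqc.py -f /path/to/in.fastq", "rqc-readqc",
#                      "/path/to/output", log)
# if job_id > 0:
#     watch_cluster("/path/to/output", log, sleep_time=300)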
def localize_dir2(db, log, bbuffer=None):
    safeBaseName, exitCode = safe_basename(db, log)
    safeDirName, exitCode = safe_dirname(db, log)
    targetScratchDir = None
    nerscDb = "/scratch/blastdb/global/dna/shared/rqc/ref_databases/ncbi/CURRENT"
    # nerscDb = "/"  ## temporarily do not use the NERSC db
    ## nerscDbPersistentBBuffer = "/var/opt/cray/dws/mounts/batch/NCBI_DB_striped_scratch/ncbi"

    ## check whether the persistent burst buffer is ready
    if bbuffer is None and "DW_PERSISTENT_STRIPED_NCBI_DB" in os.environ and \
            os.environ['DW_PERSISTENT_STRIPED_NCBI_DB'] is not None:
        nerscDb = os.path.join(os.environ['DW_PERSISTENT_STRIPED_NCBI_DB'], "ncbi")
    else:
        targetScratchDir = "/scratch/rqc"

    if bbuffer is not None:
        if not os.path.isdir(bbuffer):
            log.error("Burst Buffer is not initialized: %s", bbuffer)
            return None, RQCExitCodes.JGI_FAILURE
        targetScratchDir = bbuffer
        log.info("Localization will use the Burst Buffer location at %s", targetScratchDir)
    ## guard: targetScratchDir is None when the persistent burst buffer branch was taken
    elif targetScratchDir is not None and not os.path.isdir(targetScratchDir):
        _, _, exitCode = run_sh_command(
            "mkdir %s && chmod 777 %s" % (targetScratchDir, targetScratchDir), True, log, True)
        assert exitCode == 0

    rsyncOption = ""
    src = db
    dest = ""

    ## For:
    # GREEN_GENES = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/green_genes16s.insa_gg16S.fasta"
    # LSU_REF = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/LSURef_115_tax_silva.fasta"
    # SSU_REF = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/SSURef_NR99_115_tax_silva.fasta"
    # LSSU_REF = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/LSSURef_115_tax_silva.fasta"
    # CONTAMINANTS = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/JGIContaminants.fa"
    # COLLAB16S = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/collab16s.fa"
    if db.endswith(".fa") or db.endswith(".fasta"):
        rsyncOption = "--include '*.n??' --exclude '*.fa' --exclude '*.fasta' --exclude '%s' --exclude '*.log'" % (safeBaseName)
        src = db + ".n??"
        dest = os.path.join(targetScratchDir, safeBaseName)
        blastDb = dest

        if os.path.isfile(dest):  ## just in case the file name already exists
            rmCmd = "rm -rf %s" % (dest)
            run_sh_command(rmCmd, True, log, True)

    ## For:
    # NT_maskedYindexedN_BB = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/nt/bbtools_dedupe_mask/nt_bbdedupe_bbmasked_formatted"
    elif db.endswith("nt_bbdedupe_bbmasked_formatted"):
        if os.path.isdir(os.path.join(nerscDb, "nt/bbtools_dedupe_mask")):
            blastDb = os.path.join(nerscDb, "nt/bbtools_dedupe_mask")
            log.info("NERSC NCBI Database found: %s", blastDb)
            return blastDb, RQCExitCodes.JGI_SUCCESS

        rsyncOption = "--include '*.n??' --exclude '*.fna' --exclude 'nt_bbdedupe_bbmasked_formatted' --exclude '*.log'"
        src = safeDirName + '/'
        dest = os.path.join(targetScratchDir, safeBaseName)
        blastDb = dest

    ## For:
    # NR = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/nr/nr"
    # REFSEQ_ARCHAEA = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.archaea/refseq.archaea"
    # REFSEQ_BACTERIA = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.bacteria/refseq.bacteria"
    # REFSEQ_FUNGI = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.fungi/refseq.fungi"
    # REFSEQ_MITOCHONDRION = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.mitochondrion/refseq.mitochondrion"
    # REFSEQ_PLANT = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.plant/refseq.plant"
    # REFSEQ_PLASMID = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.plasmid/refseq.plasmid"
    # REFSEQ_PLASTID = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.plastid/refseq.plastid"
    # REFSEQ_VIRAL = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.viral/refseq.viral"
    else:
        if os.path.isdir(os.path.join(nerscDb, safeBaseName)):
            blastDb = os.path.join(nerscDb, safeBaseName)
            log.info("NERSC NCBI Database found: %s", blastDb)
            return blastDb, RQCExitCodes.JGI_SUCCESS

        rsyncOption = "--include '*.n??' --exclude '*.fna' --exclude '%s' --exclude '*.log'" % (safeBaseName)
        src = safeDirName + '/'
        dest = os.path.join(targetScratchDir, safeBaseName)
        ## was os.path.join(dest, dest), which collapses to dest because the second argument
        ## is absolute; the localized db files live at <dest>/<safeBaseName>
        blastDb = os.path.join(dest, safeBaseName)

    rsyncCmd = "rsync -av --omit-dir-times --no-perms %s %s %s" % (rsyncOption, src, dest)
    dbDirNameLocalized, _, exitCode = run_sh_command(rsyncCmd, True, log, True)

    if exitCode != 0:
        log.error("rsync failed. Cannot localize " + str(db))
        return dbDirNameLocalized, RQCExitCodes.JGI_FAILURE

    cmd = "chmod -f -R 777 %s" % (dest)
    _, _, exitCode = run_sh_command(cmd, True, log)

    return blastDb, RQCExitCodes.JGI_SUCCESS
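
## Usage sketch (illustrative only): localize_dir2() returns the localized BLAST db path
## plus an RQC exit code. The refseq.viral path comes from the reference list above; the
## downstream blastn command is a hypothetical example.
# blastDb, ec = localize_dir2(
#     "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.viral/refseq.viral", log)
# if ec == RQCExitCodes.JGI_SUCCESS:
#     run_sh_command("blastn -db %s -query subsample.fa" % blastDb, True, log, True)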
def read_megablast_hits(db, log):
    currentDir = RQCReadQcConfig.CFG["output_path"]
    megablastDir = "megablast"
    megablastPath = os.path.join(currentDir, megablastDir)
    statsFile = RQCReadQcConfig.CFG["stats_file"]
    filesFile = RQCReadQcConfig.CFG["files_file"]

    ##
    ## Process blast output files
    ##
    hitCount = 0
    parsedFile = os.path.join(megablastPath, "megablast.*.%s*.parsed" % (db))
    matchings, _, exitCode = run_sh_command(
        "grep -v '^#' %s 2>/dev/null | wc -l " % (parsedFile), True, log)

    if exitCode == 0:  ## parsed file found
        t = matchings.split()
        if len(t) == 1 and t[0].isdigit():
            hitCount = int(t[0])
        append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_MATCHING_HITS + " " + db,
                         hitCount, log)

        ##
        ## Add .parsed file
        ##
        parsedFileFound, _, exitCode = run_sh_command("ls %s" % (parsedFile), True, log)
        if parsedFileFound:
            parsedFileFound = parsedFileFound.strip()
            append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_PARSED_FILE + " " + db,
                            os.path.join(megablastPath, parsedFileFound), log)
        else:
            log.error("- Failed to add megablast parsed file of %s.", db)
            return RQCExitCodes.JGI_FAILURE

        ##
        ## wc the top hits
        ##
        topHit = 0
        tophitFile = os.path.join(megablastPath, "megablast.*.%s*.parsed.tophit" % (db))
        tophits, _, exitCode = run_sh_command(
            "grep -v '^#' %s 2>/dev/null | wc -l " % (tophitFile), True, log)
        t = tophits.split()
        if len(t) == 1 and t[0].isdigit():
            topHit = int(t[0])
        append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_TOP_HITS + " " + db, topHit, log)

        ##
        ## wc the taxonomic species
        ##
        spe = 0
        taxlistFile = os.path.join(megablastPath, "megablast.*.%s*.parsed.taxlist" % (db))
        species, _, exitCode = run_sh_command(
            "grep -v '^#' %s 2>/dev/null | wc -l " % (taxlistFile), True, log)
        t = species.split()
        if len(t) == 1 and t[0].isdigit():
            spe = int(t[0])
        append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_TAX_SPECIES + " " + db, spe, log)

        ##
        ## wc the top 100 hits
        ##
        top100hits = 0
        top100hitFile = os.path.join(megablastPath, "megablast.*.%s*.parsed.top100hit" % (db))
        species, _, exitCode = run_sh_command(
            "grep -v '^#' %s 2>/dev/null | wc -l " % (top100hitFile), True, log)
        t = species.split()
        if len(t) == 1 and t[0].isdigit():
            top100hits = int(t[0])
        append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_TOP_100HITS + " " + db,
                         top100hits, log)

        ##
        ## Find and add taxlist file
        ##
        taxListFound, _, exitCode = run_sh_command("ls %s" % (taxlistFile), True, log)
        taxListFound = taxListFound.strip()
        if taxListFound:
            append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_TAXLIST_FILE + " " + db,
                            os.path.join(megablastPath, taxListFound), log)
        else:
            log.error("- Failed to add megablast taxlist file of %s.", db)
            return RQCExitCodes.JGI_FAILURE

        ##
        ## Find and add tophit file
        ##
        tophitFound, _, exitCode = run_sh_command("ls %s" % (tophitFile), True, log)
        tophitFound = tophitFound.strip()
        if tophitFound:
            append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_TOPHIT_FILE + " " + db,
                            os.path.join(megablastPath, tophitFound), log)
        else:
            log.error("- Failed to add megablast tophit file of %s.", db)
            return RQCExitCodes.JGI_FAILURE

        ##
        ## Find and add top100hit file
        ##
        top100hitFound, _, exitCode = run_sh_command("ls %s" % (top100hitFile), True, log)
        top100hitFound = top100hitFound.strip()
        if top100hitFound:
            append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_TOP100HIT_FILE + " " + db,
                            os.path.join(megablastPath, top100hitFound), log)
        else:
            log.error("- Failed to add megablast top100hit file of %s.", db)
            return RQCExitCodes.JGI_FAILURE
    else:
        log.info("- No blast hits for %s.", db)

    return RQCExitCodes.JGI_SUCCESS
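
## Usage sketch (illustrative only): read_megablast_hits() is called once per reference db
## label after the megablast outputs have been parsed; "nt" below is a hypothetical label.
# if read_megablast_hits("nt", log) != RQCExitCodes.JGI_SUCCESS:
#     log.error("Failed to collect megablast stats for nt.")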
def post_process(fastq, outDir, filteredFastq, status, log):
    ## Obtain read counts from the input and filtered fastq files, save the values to the
    ## STATS_LIST_FILE_NAME file, and compress the filtered fastq file.
    log_and_print(
        "\n\n%s - RUN POST PROCESS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    if STEP_ORDER[status] < STEP_ORDER[POST_END]:
        checkpoint(POST_START, status)

        stats = get_dict_obj(BB_STATS_LIST_FILE_NAME)
        rawCnt = pipeline_val('inputReads', {'type': 'int', 'vtype': 'numeric'}, stats)
        rawBaseCnt = pipeline_val('inputBases', {'type': 'int', 'vtype': 'numeric'}, stats)
        newCnt = pipeline_val('outputReads', {'type': 'int', 'vtype': 'numeric'}, stats)
        newBaseCnt = pipeline_val('outputBases', {'type': 'int', 'vtype': 'numeric'}, stats)

        readCounts = {}
        ## guard against zero-read/zero-base inputs to avoid a ZeroDivisionError
        readRmPct = 100.0 * ((rawCnt - newCnt) / float(rawCnt)) if rawCnt else 0.0
        baseRmPct = 100.0 * ((rawBaseCnt - newBaseCnt) / float(rawBaseCnt)) if rawBaseCnt else 0.0
        readCounts['readRmPct'] = '%.3f' % readRmPct
        readCounts['baseRmPct'] = '%.3f' % baseRmPct

        refStats = {}
        filterLogStat = {}
        cardinality = None  ## currently never populated; written out as-is below
        bbdukVersion = None
        bbmapVersion = None

        def stat4(toks):
            ## the four-column summary lines of filter.log all share this shape
            return {"numreads": toks[0], "percreads": toks[1],
                    "numbases": toks[2], "percbases": toks[3]}

        filterLogFile = os.path.join(outDir, "filter.log")
        if os.path.isfile(filterLogFile):  ## was os.path.isfile("filter.log"), which looked in the CWD
            with open(filterLogFile, "r") as FLFH:
                isContamNumChecked = False  ## contamination is reported twice for removeribo or for MTF
                isKtrimmedTotalRemovedNumChecked = False  ## for parsing "Total Removed" after ktrimming
                for l in FLFH:
                    if l.startswith("Input:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 2
                        if 'adaptertriminput' not in filterLogStat:
                            filterLogStat["adaptertriminput"] = {"numreads": toks[0],
                                                                 "numbases": toks[1]}
                        elif 'contamtriminput' not in filterLogStat:
                            filterLogStat["contamtriminput"] = {"numreads": toks[0],
                                                                "numbases": toks[1]}
                    elif l.startswith("FTrimmed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["ftrimmed"] = stat4(toks)
                    elif l.startswith("KTrimmed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["ktrimmed"] = stat4(toks)
                        isKtrimmedTotalRemovedNumChecked = True
                    ## RQCSUPPORT-1987
                    elif l.startswith("Total Removed:") and isKtrimmedTotalRemovedNumChecked:
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["ktrimmed_total_removed"] = stat4(toks)
                        isKtrimmedTotalRemovedNumChecked = False
                    elif l.startswith("Trimmed by overlap:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["trimmedbyoverlap"] = stat4(toks)
                    elif l.startswith("Result:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        if 'adaptertrimresult' not in filterLogStat:
                            filterLogStat["adaptertrimresult"] = stat4(toks)
                        elif 'contamtrimresult' not in filterLogStat:
                            filterLogStat["contamtrimresult"] = stat4(toks)
                    elif l.startswith("Unique 31-mers:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 2 or len(toks) == 1
                        if 'adaptertrimunique31mers' not in filterLogStat:
                            filterLogStat["adaptertrimunique31mers"] = \
                                {"num": toks[1]} if len(toks) == 2 else {"num": "0"}
                        else:
                            filterLogStat["contamtrimunique31mers"] = \
                                {"num": toks[1]} if len(toks) == 2 else {"num": "0"}
                    elif not isContamNumChecked and l.startswith("Contaminants:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["contaminants"] = stat4(toks)
                        isContamNumChecked = True
                    elif l.startswith("QTrimmed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["qtrimmed"] = stat4(toks)
                    elif l.startswith("Short Read Discards:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["shortreaddiscards"] = stat4(toks)
                    elif l.startswith("Low quality discards:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["lowqualitydiscards"] = stat4(toks)
                    elif l.startswith("BBDuk version"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 1
                        bbdukVersion = toks[0]
                    elif l.startswith("BBMap version"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 1
                        bbmapVersion = toks[0]
                    ## BBDuk 36.12 06272016
                    elif l.startswith("Adapter Sequence Removed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["adaptersequenceremoved"] = stat4(toks)
                    elif l.startswith("Synthetic Contam Sequence Removed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["syntheticcontamsequenceremoved"] = stat4(toks)
                    ## 08112016
                    elif l.startswith("Short Synthetic Contam Sequence Removed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["shortsyntheticcontamsequenceremoved"] = stat4(toks)
                    elif l.startswith("Ribosomal Sequence Removed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["ribosomalsequenceremoved"] = stat4(toks)
                    ## BBMap 36.12 06272016
                    elif l.startswith("Human Sequence Removed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["humansequenceremoved"] = stat4(toks)
                    ## RQC-862, RQC-880
                    elif l.startswith("Microbial Sequence Removed:"):
                        toks = re.findall(r"(\d+.\d*)", l.rstrip())
                        assert len(toks) == 4
                        filterLogStat["microbialremoved"] = stat4(toks)

        ##
        ## refStats.txt format
        ##
        ## name %unambiguousReads unambiguousMB %ambiguousReads ambiguousMB unambiguousReads ambiguousReads
        ## human_masked 85.24693 498.92052 0.09378 0.55290 3350692 3686
        ## mouse_masked 0.03765 0.21670 0.10802 0.63690 1480 4246
        ## cat_masked 0.01862 0.09568 0.02514 0.14820 732 988
        ## dog_masked 0.00697 0.03815 0.01384 0.08160 274 544
        ##
        refStatsFile = os.path.join(outDir, "refStats.txt")
        if os.path.isfile(refStatsFile):  ## was os.path.isfile("refStats.txt"), which looked in the CWD
            with open(refStatsFile) as RFH:
                ## need to report 0 if nothing matched
                zeroStats = {"unambiguousReadsPerc": "0", "unambiguousMB": "0",
                             "ambiguousReadsPerc": "0", "ambiguousMB": "0",
                             "unambiguousReads": "0", "ambiguousReads": "0", "totalPerc": "0"}
                for organism in ('human', 'cat', 'dog', 'mouse'):
                    refStats[organism] = dict(zeroStats)

                for l in RFH:
                    if not l or l.startswith("#"):
                        continue
                    toks = l.rstrip().split()
                    assert len(toks) == 7
                    ## the number and percent of reads that map unambiguously or ambiguously
                    ## to human, cat, dog, mouse; take the sum of the two percentages
                    ## (ambiguous plus unambiguous) as the final percentage.
                    for organism in ('human', 'cat', 'dog', 'mouse'):
                        if l.startswith(organism):
                            refStats[organism] = {
                                "unambiguousReadsPerc": toks[1], "unambiguousMB": toks[2],
                                "ambiguousReadsPerc": toks[3], "ambiguousMB": toks[4],
                                "unambiguousReads": toks[5], "ambiguousReads": toks[6],
                                "totalPerc": float(toks[3]) + float(toks[1])}

            log.debug("refStats.txt: %s", str(refStats))

        ###########################################################
        log_and_print("Write to stats file %s" % STATS_LIST_FILE_NAME)
        ###########################################################
        if os.path.isfile(STATS_LIST_FILE_NAME):
            os.remove(STATS_LIST_FILE_NAME)

        with open(BB_STATS_LIST_FILE_NAME) as bbfh:
            with open(STATS_LIST_FILE_NAME, 'a') as fh:
                for line in bbfh:
                    if not line.startswith("#") and line.strip():
                        fh.write(line)

        stats = get_dict_obj(STATS_LIST_FILE_NAME)

        with open(STATS_LIST_FILE_NAME, 'a') as fh:
            for key in readCounts:
                if key not in stats:
                    write_stats(fh, key, readCounts[key], log)

            for key in refStats:
                for k in refStats[key]:
                    write_stats(fh, key + '_' + k, refStats[key][k], log)

            write_stats(fh, "cardinality", cardinality, log)

            ## write the filter.log stats to the stats file
            for key in filterLogStat:
                for k in filterLogStat[key]:
                    write_stats(fh, key + '_' + k, filterLogStat[key][k], log)

            bbversionCmd = os.path.join(BBDIR, 'bbversion.sh')
            stdOut, _, exitCode = run_sh_command(bbversionCmd, True, log)
            assert stdOut is not None
            bbtoolsVersion = stdOut.strip()

            ## 05112017 now bbtools version = bbmap version
            # bbtoolsVersion = bbmapVersion if bbmapVersion else "37.xx"
            assert bbtoolsVersion is not None
            write_stats(fh, "filter_tool", "bbtools " + bbtoolsVersion, log)
            write_stats(fh, "filter", VERSION, log)

            ## version recording
            if bbdukVersion is None:
                bbdukVersion = bbtoolsVersion
            if bbmapVersion is None:
                bbmapVersion = bbtoolsVersion
            write_stats(fh, "bbduk_version", bbdukVersion, log)
            write_stats(fh, "bbmap_version", bbmapVersion, log)

        checkpoint(POST_END, status)
        status = POST_END
    else:
        log_and_print('No need to do post processing.')

    return filteredFastq, status
def run_rqcfilter(infastq, outDir, prodType, status, enableRmoveMicrobes, enableAggressive,
                  disableRmoveMicrobes, disableClumpify, taxList, rdb, log):
    log_and_print(
        "\n\n%s - RUN RQCFILTER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))
    make_dir_p(outDir)

    opt = None
    extraOptions = ""
    optionFile = ""

    if prodType.endswith("-OLD"):
        extraOptions = " barcodefilter=f chastityfilter=f"
        prodType = prodType.replace("-OLD", "")

    if prodType in FILTER_METHODS_TXT:
        optionFile = os.path.join(SRC_ROOT, "filter_param/",
                                  FILTER_METHODS_TXT[prodType].replace(".txt", ".config"))
        log_and_print("Filter option file: %s" % optionFile)
        with open(optionFile, 'r') as ofh:
            opt = ofh.readline().rstrip()
    else:
        log_and_print("The product type, %s, is not supported yet." % prodType)
        sys.exit(2)

    assert opt, "Null filter options."

    if prodType in ("METATRANSCRIPTOME", "MTF"):
        ## either way the rRNA output file ends up named *.rRNA.fastq.gz
        if infastq.endswith(".gz"):
            opt += " outribo=%s " % (os.path.basename(infastq).replace(".fastq", ".rRNA.fastq"))
        else:
            opt += " outribo=%s " % (os.path.basename(infastq).replace(".fastq", ".rRNA.fastq.gz"))

    if enableRmoveMicrobes:
        if opt.find("removemicrobes=f") != -1:
            opt = opt.replace("removemicrobes=f", "removemicrobes=t")
        opt += " removemicrobes=t "

    if disableRmoveMicrobes:
        if opt.find("removemicrobes=t") != -1:
            opt = opt.replace("removemicrobes=t", "removemicrobes=f")
        else:
            opt += " removemicrobes=f "

    if enableAggressive:
        opt += " aggressive=t microbebuild=3 "

    if taxList:
        opt += " taxlist=%s " % (taxList)

    ## Temporarily set clumpify=t for all prod types (RQC-890)
    if not disableClumpify:
        opt += " clumpify=t "
    else:
        opt += " clumpify=f "

    opt += " tmpdir=null "
    opt += extraOptions

    cmd = os.path.join(BBDIR, "rqcfilter.sh")
    filterLogFile = os.path.join(outDir, "filter.log")
    cmdStr = "%s in=%s path=%s %s usejni=f rqcfilterdata=%s > %s 2>&1" % (
        cmd, infastq, outDir, opt, rdb, filterLogFile)

    rtn = [None, status]
    outFastqFile = None
    shFileName = "%s/filter.sh" % outDir

    def find_filtered_fastq(adir):
        outFastqFile = None
        searchPatt = os.path.join(adir, "*.fastq.gz")
        outFastqFileList = glob.glob(searchPatt)
        assert len(outFastqFileList) >= 1, "ERROR: cannot find *.fastq.gz output file."

        filterCode = None  ## initialized so the assert below catches unmatched names
        fileNamePrefix = None
        for f in outFastqFileList:
            f = os.path.basename(f)
            t = f.split(".")
            if t[-3] not in ("frag", "singleton", "unknown", "rRNA", "lmp"):
                filterCode = t[-3]
            elif t[-3] == "lmp":  ## nextera
                filterCode = t[-4]

            if len(t) == 7:  ## ex) 12345.1.1234.ACCCC.anqdpht.fastq.gz
                fileNamePrefix = '.'.join(t[:4])
            elif len(t) == 6:  ## ex) 6176.5.39297.anqrpht.fastq.gz
                fileNamePrefix = '.'.join(t[:3])
            else:
                log.warning("Unexpected filtered file name, %s", outFastqFileList)
                fileNamePrefix = '.'.join(t[:-3])
                log_and_print("Use %s as file prefix." % fileNamePrefix)

        assert filterCode and fileNamePrefix, \
            "ERROR: unexpected filter file name: %s" % (outFastqFileList)

        of = os.path.join(adir, '.'.join([fileNamePrefix, filterCode, "fastq.gz"]))
        lof = os.path.join(adir, '.'.join([fileNamePrefix, filterCode, "lmp.fastq.gz"]))
        if os.path.isfile(of):
            outFastqFile = of
        elif os.path.isfile(lof):
            outFastqFile = lof
        else:
            log.error("Cannot find fastq.gz file.")
            return None  ## guard: the rename below would fail on a missing file

        ## rename the output file to *.filtered.fastq.gz
        f = os.path.basename(outFastqFile)
        t = f.split(".")
        if t[-3] != 'filtered':
            fto = os.path.join(adir, '.'.join(['.'.join(t[:-3]), 'filtered', "fastq.gz"]))
            shutil.move(outFastqFile, fto)
            outFastqFile = fto

        return outFastqFile

    def find_filter_number(outFastqFile):
        if outFastqFile is None:
            return -1  ## guard: no filtered fastq was found
        ## integer division keeps the read count an int under Python 3
        filteredReadNum = fastqUtil.check_fastq_format(outFastqFile) // 4
        if filteredReadNum < 0:
            log_and_print("RUN RQCFILTER - filtered fastq format error: %s." % outFastqFile)
        return filteredReadNum

    if STEP_ORDER[status] < STEP_ORDER[RQCFILTER_END]:
        create_shell(shFileName, (cmdStr, ))
        log_and_print("rqcfilter cmd=[%s]" % cmdStr)
        log_and_print("sh file name=[%s]" % shFileName)

        stdOut, stdErr, exitCode = run_sh_command(shFileName, True, log, True)  ## an exit code of 0 is success
        if exitCode != 0:
            log.error("Failed to run : %s, stdout : %s, stderr: %s", shFileName, stdOut, stdErr)
            return rtn

        outFastqFile = find_filtered_fastq(outDir)
        filteredReadNum = find_filter_number(outFastqFile)
        log_and_print("Read counts after RQCFILTER step = %d" % filteredReadNum)
        log_and_print("RUN RQCFILTER - completed")
        checkpoint(RQCFILTER_END, status)
        status = RQCFILTER_END

        if filteredReadNum == 0:
            log.warning("No reads left after filtering")
            checkpoint(PIPE_COMPLETE, status)
            with open(BB_STATS_LIST_FILE_NAME, 'a') as fh:
                write_stats(fh, FILTER_READ_COUNT, 0, log)
                write_stats(fh, FILTER_READ_BASE_COUNT, 0, log)
    else:
        log_and_print("No need to rerun RQCFILTER step, get filtered files and stats ... ")
        outFastqFile = find_filtered_fastq(outDir)

    rtn = [outFastqFile, status]
    return rtn
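
## Usage sketch (illustrative only): run_rqcfilter() and post_process() chain on the shared
## checkpoint string in `status`; the fastq path, product type, and reference db path below
## are hypothetical.
# filteredFastq, status = run_rqcfilter("/path/to/raw.fastq.gz", outDir, "METAGENOME", status,
#                                       False, False, False, False, None,
#                                       "/path/to/RQCFilterData", log)
# if filteredFastq is not None:
#     filteredFastq, status = post_process("/path/to/raw.fastq.gz", outDir, filteredFastq,
#                                          status, log)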