def do_html(odir, qcdir, rawFastq, filteredFastq, status):
    """Write the HTML filtering report (index.html) and its assets into odir.

    The page template is read from PYDIR/template/template.html, its
    placeholders are filled in (page/report title, input file name, report
    date, report body, link to filter.log and the size of that log), and the
    result is written to odir/index.html.  The ``css`` and ``images``
    directories under PYDIR are then copied into odir, replacing any copies
    that already exist there.

    Args:
        odir: output directory that receives index.html, css/ and images/.
        qcdir: accepted for interface compatibility; not used directly here.
        rawFastq: path of the raw input fastq.  Its base name is shown in the
            report and the path is passed on to do_html_body.
        filteredFastq: path of the filtered fastq, passed on to do_html_body.
        status: pipeline status value; returned unchanged.

    Returns:
        The ``status`` value that was passed in.
    """
    log_and_print(
        "\n\n%s - Create HTML file <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    temp = os.path.join(PYDIR, 'template/template.html')
    with open(temp, 'r') as fh:
        html = fh.read()

    html = html.replace('[_PAGE-TITLE_]', 'Filter Report')
    html = html.replace('[_REPORT-TITLE_]', 'BBTools Filtering Report')
    html = html.replace('[_INPUT-FILE-NAME_]', os.path.basename(rawFastq))
    html = html.replace(
        '[_REPORT-DATE_]',
        '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

    # NOTE(review): qcdir is not forwarded; do_html_body is called with the
    # same three arguments as before -- confirm that is intended.
    hbody = do_html_body(odir, rawFastq, filteredFastq)
    html = html.replace('[_REPORT-BODY_]', hbody)

    # Link to the filter log and show its size.
    # NOTE(review): outputPath is neither a parameter nor a local of this
    # function -- presumably a module-level global; confirm it refers to the
    # same directory as odir, since the href written below is relative.
    fbasename = 'filter.log'
    logpath = os.path.join(outputPath, fbasename)
    html = html.replace('[_FILTER-LOG_]',
                        html_tag('a', fbasename, {'href': fbasename}))
    # Bytes -> KiB.  The divisor used to be 2014.0, a typo for 1024.0.
    fsize = '%.1f' % (float(os.stat(logpath).st_size) / 1024.0)
    html = html.replace('[_FILTER-LOG-SIZE_]', fsize)

    # The [_FILTER-REPORT_] / [_FILTER-REPORT-SIZE_] placeholders (filter.txt)
    # are not filled in here; the code that did so had been commented out.

    ## write the html to file
    idxfile = os.path.join(odir, 'index.html')
    with open(idxfile, 'w') as fh2:
        fh2.write(html)
    print('HTML index file written to %s' % idxfile)

    # Copy the css and image directories, replacing any existing copy
    # (copytree requires that the destination does not exist yet).
    for asset in ('css', 'images'):
        srcdir = os.path.join(PYDIR, asset)
        todir = os.path.join(odir, asset)
        if os.path.isdir(todir):
            shutil.rmtree(todir)
        shutil.copytree(srcdir, todir, symlinks=False, ignore=None)

    return status
def do_html_body(odir, rawFastq, filteredFastq):
    """Build the body section of the filtering report as an HTML string.

    Reads PYDIR/template/filter_body_template.html and substitutes its
    placeholders: the raw/filtered file locations and sizes, one value per
    entry of ``tok_map`` (looked up through pipeline_val in the stats loaded
    from odir/STATS_LIST_FILE_NAME), and the read-QC section.

    Args:
        odir: output directory holding the stats file; also passed to
            do_readqc_html_body.
        rawFastq: path of the raw fastq (shown in the report and stat'ed for
            its size).
        filteredFastq: path of the filtered fastq (shown and stat'ed likewise).

    Returns:
        The filled-in HTML text.
    """
    stats = get_dict_obj(os.path.join(odir, STATS_LIST_FILE_NAME))

    # stats key -> placeholder token plus the options handed to pipeline_val.
    tok_map = {
        'inputReads': {'token': '[_RAW-READ-CNT_]', 'type': 'bigint'},
        'inputBases': {'token': '[_RAW-BASE-CNT_]', 'type': 'bigint'},
        'outputReads': {'token': '[_FILTERED-READ-CNT_]', 'type': 'bigint'},
        'outputBases': {'token': '[_FILTERED-BASE-CNT_]', 'type': 'bigint'},
        'readRmPct': {'token': '[_REMOVED-READ-PCT_]', 'type': 'raw'},
        'baseRmPct': {'token': '[_REMOVED-BASE-PCT_]', 'type': 'raw'},
        'lowqualitydiscards_numreads': {'token': '[_LOW-QUAL-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'lowqualitydiscards_percreads': {'token': '[_LOW-QUAL-REMOVED-PCT_]', 'type': 'raw', 'filter': 0},
        'contaminants_numreads': {'token': '[_ARTI-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'contaminants_percreads': {'token': '[_ARTI-REMOVED-PCT_]', 'type': 'raw', 'filter': 0},
        'ribosomalsequenceremoved_numreads': {'token': '[_RRNA-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'ribosomalsequenceremoved_percreads': {'token': '[_RRNA-REMOVED-READ-PCT_]', 'type': 'raw', 'filter': 0},
        'microbialremoved_numreads': {'token': '[_MICROBE-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'microbialremoved_percreads': {'token': '[_MICROBE-REMOVED-READ-PCT_]', 'type': 'raw', 'filter': 0},
        'human_unambiguousreads': {'token': '[_HUMAN-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'human_unambiguousreadsperc': {'token': '[_HUMAN-REMOVED-READ-PCT_]', 'type': 'raw', 'filter': 0},
        'dog_unambiguousreads': {'token': '[_DOG-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'dog_unambiguousreadsperc': {'token': '[_DOG-REMOVED-READ-PCT_]', 'type': 'raw', 'filter': 0},
        'cat_unambiguousreads': {'token': '[_CAT-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'cat_unambiguousreadsperc': {'token': '[_CAT-REMOVED-READ-PCT_]', 'type': 'raw', 'filter': 0},
        'mouse_unambiguousreads': {'token': '[_MOUSE-REMOVED-READ-CNT_]', 'type': 'bigint', 'filter': 0},
        'mouse_unambiguousreadsperc': {'token': '[_MOUSE-REMOVED-READ-PCT_]', 'type': 'raw', 'filter': 0},
    }

    template_path = os.path.join(PYDIR, 'template/filter_body_template.html')
    with open(template_path, 'r') as template:
        page = template.read()

    ## do the place-holder replacement !!
    bytes_per_mb = 1024 * 1024
    page = page.replace('[_RAW-FILE-LOCATION_]', rawFastq)
    page = page.replace('[_FILTERED-FILE-LOCATION_]', filteredFastq)
    raw_mb = os.stat(rawFastq).st_size / bytes_per_mb
    page = page.replace('[_RAW-FILE-SIZE_]', format(raw_mb, ','))
    filtered_mb = os.stat(filteredFastq).st_size / bytes_per_mb
    page = page.replace('[_FILTERED-FILE-SIZE_]', format(filtered_mb, ','))

    for stat_key, spec in tok_map.items():
        page = page.replace(spec['token'], pipeline_val(stat_key, spec, stats))

    # readqc on the filter file
    # NOTE(review): qcdir is not a parameter or local of this function --
    # presumably a module-level global; verify against the caller.
    readqc_section = do_readqc_html_body(qcdir, odir) if qcdir else ''
    page = page.replace('[_FILTERED-READ-QC_]', readqc_section)

    return page
# Matches the numeric-looking fields of a filter.log line.  The pattern value
# is unchanged from the original inline literals; it is only written as a raw
# string now so that "\d" is not an invalid string escape.
# NOTE(review): the "." is unescaped and therefore matches any character (an
# integer followed by a space yields a token with that trailing space);
# confirm what downstream consumers expect before tightening it.
_FILTER_LOG_NUMBER_RE = re.compile(r"(\d+.\d*)")

# filter.log line prefixes whose four numeric fields are stored unconditionally
# as a numreads/percreads/numbases/percbases record under the paired key.
_FILTER_LOG_FOUR_FIELD_PREFIXES = (
    ("FTrimmed:", "ftrimmed"),
    ("Trimmed by overlap:", "trimmedbyoverlap"),
    ("QTrimmed:", "qtrimmed"),
    ("Short Read Discards:", "shortreaddiscards"),
    ("Low quality discards:", "lowqualitydiscards"),
    ## BBDuk 36.12 06272016
    ("Adapter Sequence Removed:", "adaptersequenceremoved"),
    ("Synthetic Contam Sequence Removed:", "syntheticcontamsequenceremoved"),
    ## 08112016
    ("Short Synthetic Contam Sequence Removed:", "shortsyntheticcontamsequenceremoved"),
    ("Ribosomal Sequence Removed:", "ribosomalsequenceremoved"),
    ## BBMap 36.12 06272016
    ("Human Sequence Removed:", "humansequenceremoved"),
    ## RQC-862, RQC-880
    ("Microbial Sequence Removed:", "microbialremoved"),
)

# Row-name prefixes looked for in refStats.txt, in reporting order.
_REF_STATS_ORGANISMS = ("human", "cat", "dog", "mouse")


def _filter_log_numbers(line):
    """Return the numeric-looking tokens found in one filter.log line."""
    return _FILTER_LOG_NUMBER_RE.findall(line.rstrip())


def _four_field_record(toks):
    """Turn exactly four tokens into a reads/bases count-and-percent record."""
    assert len(toks) == 4
    return {
        "numreads": toks[0],
        "percreads": toks[1],
        "numbases": toks[2],
        "percbases": toks[3]
    }


def _removed_percent(before, after):
    """Return the percentage of ``before`` that is missing from ``after``.

    Returns 0.0 when ``before`` is 0 so that an empty input does not raise
    ZeroDivisionError.
    """
    if before == 0:
        return 0.0
    return 100.0 * ((before - after) / float(before))


def _parse_filter_log_stats(logPath):
    """Collect per-step statistics and tool versions from a filter.log file.

    Returns:
        (filterLogStat, bbdukVersion, bbmapVersion): a dict of records keyed
        by step name, and the version strings found on the "BBDuk version" /
        "BBMap version" lines (None when no such line is present).
    """
    filterLogStat = {}
    bbdukVersion = None
    bbmapVersion = None
    isContamNumChecked = False  ## Contamination will be done twice for removeribo or for MTF
    isKtrimmedTotalRemovedNumChecked = False  ## for parsing "Total Removed" after ktrimming

    with open(logPath, "r") as FLFH:
        for l in FLFH:
            if l.startswith("Input:"):
                toks = _filter_log_numbers(l)
                assert len(toks) == 2
                # First occurrence -> adaptertriminput, second ->
                # contamtriminput; any further ones are ignored.
                for key in ("adaptertriminput", "contamtriminput"):
                    if key not in filterLogStat:
                        filterLogStat[key] = {
                            "numreads": toks[0],
                            "numbases": toks[1]
                        }
                        break

            elif l.startswith("KTrimmed:"):
                filterLogStat["ktrimmed"] = _four_field_record(
                    _filter_log_numbers(l))
                isKtrimmedTotalRemovedNumChecked = True  ## RQCSUPPORT-1987

            elif l.startswith("Total Removed:") and isKtrimmedTotalRemovedNumChecked:
                # Only the "Total Removed:" line that follows a "KTrimmed:"
                # line is recorded.
                filterLogStat["ktrimmed_total_removed"] = _four_field_record(
                    _filter_log_numbers(l))
                isKtrimmedTotalRemovedNumChecked = False

            elif l.startswith("Result:"):
                record = _four_field_record(_filter_log_numbers(l))
                # Same first/second occurrence rule as for "Input:".
                for key in ("adaptertrimresult", "contamtrimresult"):
                    if key not in filterLogStat:
                        filterLogStat[key] = record
                        break

            elif l.startswith("Unique 31-mers:"):
                toks = _filter_log_numbers(l)
                assert len(toks) == 2 or len(toks) == 1
                # The "31-" of the label itself is always matched as the first
                # token; the count, when present, is the second one.
                num = toks[1] if len(toks) == 2 else "0"
                if 'adaptertrimunique31mers' not in filterLogStat:
                    filterLogStat["adaptertrimunique31mers"] = {"num": num}
                else:
                    filterLogStat["contamtrimunique31mers"] = {"num": num}

            elif not isContamNumChecked and l.startswith("Contaminants:"):
                # Only the first "Contaminants:" line is recorded.
                filterLogStat["contaminants"] = _four_field_record(
                    _filter_log_numbers(l))
                isContamNumChecked = True

            elif l.startswith("BBDuk version"):
                toks = _filter_log_numbers(l)
                assert len(toks) == 1
                bbdukVersion = toks[0]

            elif l.startswith("BBMap version"):
                toks = _filter_log_numbers(l)
                assert len(toks) == 1
                bbmapVersion = toks[0]

            else:
                # None of these prefixes is a prefix of another (or of the
                # ones handled above), so lookup order does not matter.
                for prefix, key in _FILTER_LOG_FOUR_FIELD_PREFIXES:
                    if l.startswith(prefix):
                        filterLogStat[key] = _four_field_record(
                            _filter_log_numbers(l))
                        break

    return filterLogStat, bbdukVersion, bbmapVersion


def _parse_ref_stats_file(refStatsFile):
    """Read per-organism mapping statistics from a refStats.txt file.

    ## refStats.txt format
    ##
    ## name %unambiguousReads unambiguousMB %ambiguousReads ambiguousMB unambiguousReads ambiguousReads
    ## human_masked 85.24693 498.92052 0.09378 0.55290 3350692 3686
    ## mouse_masked 0.03765 0.21670 0.10802 0.63690 1480 4246
    ## cat_masked 0.01862 0.09568 0.02514 0.14820 732 988
    ## dog_masked 0.00697 0.03815 0.01384 0.08160 274 544

    Returns:
        A dict with one record for each of human, cat, dog and mouse.
        Organisms with no row in the file keep an all-"0" record.
    """
    refStats = {}
    with open(refStatsFile) as RFH:
        ## Need to report 0 if nothing matched
        for organism in _REF_STATS_ORGANISMS:
            refStats[organism] = {
                "unambiguousReadsPerc": "0",
                "unambiguousMB": "0",
                "ambiguousReadsPerc": "0",
                "ambiguousMB": "0",
                "unambiguousReads": "0",
                "ambiguousReads": "0",
                "totalPerc": "0"
            }

        for l in RFH:
            # Lines read from a file keep their newline, so a plain truth test
            # never skips blank lines; test the stripped text instead.
            if not l.strip() or l.startswith("#"):
                continue

            toks = l.rstrip().split()
            assert len(toks) == 7

            ## the number and percent of reads that map unambiguously or ambiguously to human, cat, dog.
            ## take the sum of the two numbers (ambiguous plus unambiguous) to use as the final percentage.
            for organism in _REF_STATS_ORGANISMS:
                if l.startswith(organism):
                    refStats[organism] = {
                        "unambiguousReadsPerc": toks[1],
                        "unambiguousMB": toks[2],
                        "ambiguousReadsPerc": toks[3],
                        "ambiguousMB": toks[4],
                        "unambiguousReads": toks[5],
                        "ambiguousReads": toks[6],
                        "totalPerc": float(toks[3]) + float(toks[1])
                    }

    return refStats


def post_process(fastq, outDir, filteredFastq, status, log):
    """Gather the filtering statistics and write them to STATS_LIST_FILE_NAME.

    When ``status`` is already at or past POST_END nothing is done.  Otherwise
    the function:

    * reads the input/output read and base counts from
      BB_STATS_LIST_FILE_NAME and derives the removed-read/base percentages;
    * parses outDir/filter.log (if present) for per-step statistics and the
      BBDuk/BBMap version lines;
    * parses outDir/refStats.txt (if present) for the per-organism mapping
      statistics;
    * rewrites STATS_LIST_FILE_NAME from the non-comment, non-blank lines of
      BB_STATS_LIST_FILE_NAME and appends all of the values above plus the
      tool/version entries (the bbtools version is the output of
      BBDIR/bbversion.sh).

    Args:
        fastq: not used by this function.
        outDir: directory searched for filter.log and refStats.txt.
        filteredFastq: returned unchanged.
        status: current pipeline step, compared through STEP_ORDER.
        log: logger passed to write_stats / run_sh_command and used for
            debug output.

    Returns:
        (filteredFastq, status) where status is POST_END after a completed
        run, or the unchanged input status when the step was skipped.

    Raises:
        AssertionError: when a parsed line does not have the expected number
            of fields, or bbversion.sh produced no output.
    """
    log_and_print(
        "\n\n%s - RUN POST PROCESS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    if STEP_ORDER[status] >= STEP_ORDER[POST_END]:
        log_and_print('No need to do post processing.')
        return filteredFastq, status

    checkpoint(POST_START, status)

    stats = get_dict_obj(BB_STATS_LIST_FILE_NAME)
    rawCnt = pipeline_val('inputReads', {'type': 'int', 'vtype': 'numeric'}, stats)
    rawBaseCnt = pipeline_val('inputBases', {'type': 'int', 'vtype': 'numeric'}, stats)
    newCnt = pipeline_val('outputReads', {'type': 'int', 'vtype': 'numeric'}, stats)
    newBaseCnt = pipeline_val('outputBases', {'type': 'int', 'vtype': 'numeric'}, stats)

    readCounts = {}
    readCounts['readRmPct'] = '%.3f' % _removed_percent(rawCnt, newCnt)
    readCounts['baseRmPct'] = '%.3f' % _removed_percent(rawBaseCnt, newBaseCnt)

    # Never computed in this function; it is written out as None below.
    cardinality = None

    # Test for the same path that is opened.  Previously the existence check
    # looked in the current working directory while the open() used outDir.
    filterLogStat = {}
    bbdukVersion = None
    bbmapVersion = None
    filterLog = os.path.join(outDir, "filter.log")
    if os.path.isfile(filterLog):
        filterLogStat, bbdukVersion, bbmapVersion = _parse_filter_log_stats(filterLog)

    refStats = {}
    refStatsFile = os.path.join(outDir, "refStats.txt")
    if os.path.isfile(refStatsFile):
        refStats = _parse_ref_stats_file(refStatsFile)
        log.debug("refStats.txt: %s", str(refStats))

    log_and_print("Write to stats file %s" % STATS_LIST_FILE_NAME)

    # Start the stats file afresh from the BBTools stats list, dropping
    # comment and blank lines.
    if os.path.isfile(STATS_LIST_FILE_NAME):
        os.remove(STATS_LIST_FILE_NAME)

    with open(BB_STATS_LIST_FILE_NAME) as bbfh:
        with open(STATS_LIST_FILE_NAME, 'a') as fh:
            for line in bbfh:
                if not line.startswith("#") and line.strip():
                    fh.write(line)

    stats = get_dict_obj(STATS_LIST_FILE_NAME)
    with open(STATS_LIST_FILE_NAME, 'a') as fh:
        # Removal percentages are only added when the copied stats do not
        # already provide them.
        for key in readCounts:
            if key not in stats:
                write_stats(fh, key, readCounts[key], log)

        for key in refStats:
            for k in refStats[key]:
                write_stats(fh, key + '_' + k, refStats[key][k], log)

        write_stats(fh, "cardinality", cardinality, log)

        ## Write the filter.log statistics
        for key in filterLogStat:
            for k in filterLogStat[key]:
                write_stats(fh, key + '_' + k, filterLogStat[key][k], log)

        bbversionCmd = os.path.join(BBDIR, 'bbversion.sh')
        stdOut, _, _ = run_sh_command(bbversionCmd, True, log)
        assert stdOut is not None
        ## 05112017 Now bbtools version = bbmap version
        bbtoolsVersion = stdOut.strip()

        write_stats(fh, "filter_tool", "bbtools " + bbtoolsVersion, log)
        write_stats(fh, "filter", VERSION, log)

        ## Version recording: fall back to the bbtools version when
        ## filter.log did not report one.
        if bbdukVersion is None:
            bbdukVersion = bbtoolsVersion
        if bbmapVersion is None:
            bbmapVersion = bbtoolsVersion
        write_stats(fh, "bbduk_version", bbdukVersion, log)
        write_stats(fh, "bbmap_version", bbmapVersion, log)

    checkpoint(POST_END, status)
    return filteredFastq, POST_END