Example #1
0
def do_html(odir, qcdir, rawFastq, filteredFastq, status):
    """Render the filtering report as ``index.html`` in ``odir``.

    The page template ``template/template.html`` under PYDIR is read, its
    ``[_..._]`` place-holders are substituted (titles, input file name,
    report date, the body produced by do_html_body(), and a link to
    ``filter.log`` together with its size), and the result is written to
    ``<odir>/index.html``.  The ``css`` and ``images`` directories under
    PYDIR are then copied into ``odir``, replacing any existing copies.

    Args:
        odir: output directory that receives index.html, css/ and images/.
        qcdir: not referenced in this function.
            NOTE(review): do_html_body() reads a name ``qcdir`` that is not
            one of its parameters -- confirm where that value comes from.
        rawFastq: path of the raw input fastq; its base name is shown in
            the report and the path is forwarded to do_html_body().
        filteredFastq: path of the filtered fastq, forwarded to
            do_html_body().
        status: pipeline status value; returned unchanged.

    Returns:
        The ``status`` argument, unchanged.
    """
    log_and_print(
        "\n\n%s - Create HTML file <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    # Read the template up front so the handle is not held open while the
    # report is rendered and the asset directories are copied.
    templatePath = os.path.join(PYDIR, 'template/template.html')
    with open(templatePath, 'r') as fh:
        html = fh.read()

    html = html.replace('[_PAGE-TITLE_]', 'Filter Report')
    html = html.replace('[_REPORT-TITLE_]', 'BBTools Filtering Report')
    html = html.replace('[_INPUT-FILE-NAME_]', os.path.basename(rawFastq))
    html = html.replace(
        '[_REPORT-DATE_]',
        '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

    hbody = do_html_body(odir, rawFastq, filteredFastq)
    html = html.replace('[_REPORT-BODY_]', hbody)

    # Link to the filter log and report its size.
    # NOTE(review): outputPath is not a parameter of this function; it is
    # resolved from the enclosing scope -- confirm it names the same
    # directory as odir, since the link below is relative to index.html.
    logBasename = 'filter.log'
    logPath = os.path.join(outputPath, logBasename)
    html = html.replace('[_FILTER-LOG_]',
                        html_tag('a', logBasename, {'href': logBasename}))
    # bytes -> KiB (the divisor used to be 2014.0, a typo for 1024.0)
    logSize = '%.1f' % (float(os.stat(logPath).st_size) / 1024.0)
    html = html.replace('[_FILTER-LOG-SIZE_]', logSize)

    ## write the html to file
    idxfile = os.path.join(odir, 'index.html')
    with open(idxfile, 'w') as fh2:
        fh2.write(html)
    print('HTML index file written to %s' % idxfile)

    # copy the css and image directories, replacing any previous copy
    for asset in ('css', 'images'):
        srcdir = os.path.join(PYDIR, asset)
        todir = os.path.join(odir, asset)
        if os.path.isdir(todir):
            shutil.rmtree(todir)
        shutil.copytree(srcdir, todir, False, None)

    return status
Example #2
0
def do_html_body(odir, rawFastq, filteredFastq):
    """Build the body section of the filtering report and return it as a string.

    Reads ``template/filter_body_template.html`` under PYDIR and substitutes
    its ``[_..._]`` place-holders with the fastq locations, the fastq file
    sizes (bytes divided by 1024*1024, formatted with thousands separators)
    and the values that pipeline_val() yields for each entry of the token
    table, using the stats loaded from ``<odir>/STATS_LIST_FILE_NAME``.

    Args:
        odir: directory holding the stats list file.
        rawFastq: path of the raw input fastq.
        filteredFastq: path of the filtered fastq.

    Returns:
        The filled-in HTML fragment.
    """
    stats = get_dict_obj(os.path.join(odir, STATS_LIST_FILE_NAME))

    # (stats key, template token, value type) for the overall read/base counts
    summaryFields = (
        ('inputReads', '[_RAW-READ-CNT_]', 'bigint'),
        ('inputBases', '[_RAW-BASE-CNT_]', 'bigint'),
        ('outputReads', '[_FILTERED-READ-CNT_]', 'bigint'),
        ('outputBases', '[_FILTERED-BASE-CNT_]', 'bigint'),
        ('readRmPct', '[_REMOVED-READ-PCT_]', 'raw'),
        ('baseRmPct', '[_REMOVED-BASE-PCT_]', 'raw'),
    )
    # same layout for the per-filter removal counts; these entries
    # additionally carry 'filter': 0
    removalFields = (
        ('lowqualitydiscards_numreads', '[_LOW-QUAL-REMOVED-READ-CNT_]', 'bigint'),
        ('lowqualitydiscards_percreads', '[_LOW-QUAL-REMOVED-PCT_]', 'raw'),
        ('contaminants_numreads', '[_ARTI-REMOVED-READ-CNT_]', 'bigint'),
        ('contaminants_percreads', '[_ARTI-REMOVED-PCT_]', 'raw'),
        ('ribosomalsequenceremoved_numreads', '[_RRNA-REMOVED-READ-CNT_]', 'bigint'),
        ('ribosomalsequenceremoved_percreads', '[_RRNA-REMOVED-READ-PCT_]', 'raw'),
        ('microbialremoved_numreads', '[_MICROBE-REMOVED-READ-CNT_]', 'bigint'),
        ('microbialremoved_percreads', '[_MICROBE-REMOVED-READ-PCT_]', 'raw'),
        ('human_unambiguousreads', '[_HUMAN-REMOVED-READ-CNT_]', 'bigint'),
        ('human_unambiguousreadsperc', '[_HUMAN-REMOVED-READ-PCT_]', 'raw'),
        ('dog_unambiguousreads', '[_DOG-REMOVED-READ-CNT_]', 'bigint'),
        ('dog_unambiguousreadsperc', '[_DOG-REMOVED-READ-PCT_]', 'raw'),
        ('cat_unambiguousreads', '[_CAT-REMOVED-READ-CNT_]', 'bigint'),
        ('cat_unambiguousreadsperc', '[_CAT-REMOVED-READ-PCT_]', 'raw'),
        ('mouse_unambiguousreads', '[_MOUSE-REMOVED-READ-CNT_]', 'bigint'),
        ('mouse_unambiguousreadsperc', '[_MOUSE-REMOVED-READ-PCT_]', 'raw'),
    )

    tok_map = {}
    for statKey, token, valType in summaryFields:
        tok_map[statKey] = {'token': token, 'type': valType}
    for statKey, token, valType in removalFields:
        tok_map[statKey] = {'token': token, 'type': valType, 'filter': 0}

    templatePath = os.path.join(PYDIR, 'template/filter_body_template.html')
    with open(templatePath, 'r') as fh:
        html = fh.read()

    ## do the place-holder replacement !!
    html = html.replace('[_RAW-FILE-LOCATION_]', rawFastq)
    html = html.replace('[_FILTERED-FILE-LOCATION_]', filteredFastq)

    rawSize = format(os.stat(rawFastq).st_size / (1024*1024), ',')
    html = html.replace('[_RAW-FILE-SIZE_]', rawSize)
    filteredSize = format(os.stat(filteredFastq).st_size / (1024*1024), ',')
    html = html.replace('[_FILTERED-FILE-SIZE_]', filteredSize)

    for statKey, spec in tok_map.items():
        html = html.replace(spec['token'], pipeline_val(statKey, spec, stats))

    # readqc on the filter file
    # NOTE(review): qcdir is not a parameter of this function; it is
    # resolved from the enclosing scope -- confirm it is defined there.
    readqcBody = do_readqc_html_body(qcdir, odir) if qcdir else ''
    return html.replace('[_FILTERED-READ-QC_]', readqcBody)
Example #3
0
# Numeric-token pattern applied to filter.log lines.
# NOTE: the '.' is deliberately left unescaped -- it matches ANY character,
# so every token is "digits, one arbitrary character, optional digits"
# (e.g. "1234 " or "1.23").  A lone single digit at the end of a line
# therefore yields no token at all, which is why the "Unique 31-mers:"
# handling accepts one token (just the "31-" of the label) and records "0".
# Tightening the pattern would change the token counts the asserts rely on.
_FILTER_LOG_NUM_RE = re.compile(r"(\d+.\d*)")

# filter.log line prefixes whose four tokens are stored as
# numreads / percreads / numbases / percbases, and the stats key of each.
_FILTER_LOG_FOUR_FIELD = (
    ("FTrimmed:", "ftrimmed"),
    ("KTrimmed:", "ktrimmed"),
    ("Total Removed:", "ktrimmed_total_removed"),  ## RQCSUPPORT-1987
    ("Trimmed by overlap:", "trimmedbyoverlap"),
    ("Contaminants:", "contaminants"),
    ("QTrimmed:", "qtrimmed"),
    ("Short Read Discards:", "shortreaddiscards"),
    ("Low quality discards:", "lowqualitydiscards"),
    ## BBDuk 36.12 06272016
    ("Adapter Sequence Removed:", "adaptersequenceremoved"),
    ("Synthetic Contam Sequence Removed:", "syntheticcontamsequenceremoved"),
    ## 08112016
    ("Short Synthetic Contam Sequence Removed:",
     "shortsyntheticcontamsequenceremoved"),
    ("Ribosomal Sequence Removed:", "ribosomalsequenceremoved"),
    ## BBMap 36.12 06272016
    ("Human Sequence Removed:", "humansequenceremoved"),
    ## RQC-862, RQC-880
    ("Microbial Sequence Removed:", "microbialremoved"),
)

# Reference names looked for at the start of refStats.txt lines.
_REF_STATS_SPECIES = ("human", "cat", "dog", "mouse")


def _removed_percent(before, after):
    """Return the percentage of ``before`` that is missing in ``after`` (0.0 when ``before`` is 0)."""
    if before == 0:
        return 0.0
    return 100.0 * ((before - after) / float(before))


def _four_field_stat(toks):
    """Map four filter.log tokens onto the reads/bases count-and-percent dict."""
    assert len(toks) == 4
    return {
        "numreads": toks[0],
        "percreads": toks[1],
        "numbases": toks[2],
        "percbases": toks[3]
    }


def _parse_filter_log(logPath):
    """Parse filter.log; return (filterLogStat, bbdukVersion, bbmapVersion)."""
    filterLogStat = {}
    bbdukVersion = None
    bbmapVersion = None
    isContamNumChecked = False  ## Contamination will be done twice for removeribo or for MTF
    isKtrimmedTotalRemovedNumChecked = False  ## for parsing "Total Removed" after ktrimming

    with open(logPath, "r") as FLFH:
        for line in FLFH:
            if line.startswith("Input:"):
                toks = _FILTER_LOG_NUM_RE.findall(line.rstrip())
                assert len(toks) == 2
                # first occurrence -> adapter trimming, second -> contam trimming
                for key in ("adaptertriminput", "contamtriminput"):
                    if key not in filterLogStat:
                        filterLogStat[key] = {
                            "numreads": toks[0],
                            "numbases": toks[1]
                        }
                        break
                continue

            if line.startswith("Result:"):
                entry = _four_field_stat(
                    _FILTER_LOG_NUM_RE.findall(line.rstrip()))
                # first occurrence -> adapter trimming, second -> contam trimming
                for key in ("adaptertrimresult", "contamtrimresult"):
                    if key not in filterLogStat:
                        filterLogStat[key] = entry
                        break
                continue

            if line.startswith("Unique 31-mers:"):
                toks = _FILTER_LOG_NUM_RE.findall(line.rstrip())
                assert len(toks) == 2 or len(toks) == 1
                # toks[0] comes from the "31-" in the label; a missing second
                # token is recorded as "0" (see the pattern note above).
                num = toks[1] if len(toks) == 2 else "0"
                if 'adaptertrimunique31mers' not in filterLogStat:
                    filterLogStat["adaptertrimunique31mers"] = {"num": num}
                else:
                    filterLogStat["contamtrimunique31mers"] = {"num": num}
                continue

            if line.startswith("BBDuk version"):
                toks = _FILTER_LOG_NUM_RE.findall(line.rstrip())
                assert len(toks) == 1
                bbdukVersion = toks[0]
                continue

            if line.startswith("BBMap version"):
                toks = _FILTER_LOG_NUM_RE.findall(line.rstrip())
                assert len(toks) == 1
                bbmapVersion = toks[0]
                continue

            for prefix, key in _FILTER_LOG_FOUR_FIELD:
                if line.startswith(prefix):
                    break
            else:
                continue  # not a line we collect

            # "Total Removed:" only counts right after a "KTrimmed:" line.
            if (key == "ktrimmed_total_removed"
                    and not isKtrimmedTotalRemovedNumChecked):
                continue
            # Only the first "Contaminants:" line is recorded.
            if key == "contaminants" and isContamNumChecked:
                continue

            filterLogStat[key] = _four_field_stat(
                _FILTER_LOG_NUM_RE.findall(line.rstrip()))

            if key == "ktrimmed":
                isKtrimmedTotalRemovedNumChecked = True
            elif key == "ktrimmed_total_removed":
                isKtrimmedTotalRemovedNumChecked = False
            elif key == "contaminants":
                isContamNumChecked = True

    return filterLogStat, bbdukVersion, bbmapVersion


def _parse_ref_stats(refStatsPath):
    """Parse refStats.txt into per-reference dicts, defaulting every field to "0"."""
    ##
    ## refStats.txt format
    ##
    ## name %unambiguousReads   unambiguousMB   %ambiguousReads ambiguousMB unambiguousReads    ambiguousReads
    ## human_masked 85.24693    498.92052   0.09378 0.55290 3350692 3686
    ## mouse_masked 0.03765 0.21670 0.10802 0.63690 1480    4246
    ## cat_masked   0.01862 0.09568 0.02514 0.14820 732 988
    ## dog_masked   0.00697 0.03815 0.01384 0.08160 274 544
    ##
    refStats = {}
    ## Need to report 0 if nothing matched
    for species in _REF_STATS_SPECIES:
        refStats[species] = {
            "unambiguousReadsPerc": "0",
            "unambiguousMB": "0",
            "ambiguousReadsPerc": "0",
            "ambiguousMB": "0",
            "unambiguousReads": "0",
            "ambiguousReads": "0",
            "totalPerc": "0"
        }

    with open(refStatsPath) as RFH:
        for line in RFH:
            # skip blank lines and the '#' header
            if not line.strip() or line.startswith("#"):
                continue

            toks = line.rstrip().split()
            assert len(toks) == 7

            ## the number and percent of reads that map unambiguously or ambiguously to human, cat, dog.
            ## take the sum of the two numbers (ambiguous plus unambiguous) to use as the final percentage.
            for species in _REF_STATS_SPECIES:
                if line.startswith(species):
                    refStats[species] = {
                        "unambiguousReadsPerc": toks[1],
                        "unambiguousMB": toks[2],
                        "ambiguousReadsPerc": toks[3],
                        "ambiguousMB": toks[4],
                        "unambiguousReads": toks[5],
                        "ambiguousReads": toks[6],
                        "totalPerc": float(toks[3]) + float(toks[1])
                    }
                    break

    return refStats


def post_process(fastq, outDir, filteredFastq, status, log):
    """Collect the filtering statistics and write them to STATS_LIST_FILE_NAME.

    When ``status`` is already at or past POST_END nothing is done.
    Otherwise the function:

    * reads the input/output read and base counts from
      BB_STATS_LIST_FILE_NAME and derives the removed-read and
      removed-base percentages;
    * parses ``<outDir>/filter.log`` (if present) for the per-step counts
      and the BBDuk / BBMap version lines;
    * parses ``<outDir>/refStats.txt`` (if present) for the per-reference
      mapping numbers;
    * recreates STATS_LIST_FILE_NAME from the non-comment, non-blank lines
      of BB_STATS_LIST_FILE_NAME and appends all of the above plus the
      tool versions through write_stats().

    Args:
        fastq: not referenced in this function.
        outDir: directory containing filter.log and refStats.txt.
        filteredFastq: returned unchanged.
        status: current pipeline step, compared through STEP_ORDER.
        log: logger passed to write_stats() and run_sh_command().

    Returns:
        Tuple ``(filteredFastq, status)``; ``status`` is POST_END once the
        post processing has run.

    Raises:
        AssertionError: when a parsed line does not contain the expected
            number of tokens, or bbversion.sh produced no output.
    """
    log_and_print(
        "\n\n%s - RUN POST PROCESS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    if not (STEP_ORDER[status] < STEP_ORDER[POST_END]):
        log_and_print('No need to do post processing.')
        return filteredFastq, status

    checkpoint(POST_START, status)

    stats = get_dict_obj(BB_STATS_LIST_FILE_NAME)
    rawCnt, rawBaseCnt, newCnt, newBaseCnt = [
        pipeline_val(key, {'type': 'int', 'vtype': 'numeric'}, stats)
        for key in ('inputReads', 'inputBases', 'outputReads', 'outputBases')
    ]

    # A zero input count is reported as 0% removed instead of raising
    # ZeroDivisionError.
    readCounts = {
        'readRmPct': '%.3f' % _removed_percent(rawCnt, newCnt),
        'baseRmPct': '%.3f' % _removed_percent(rawBaseCnt, newBaseCnt),
    }

    refStats = {}
    filterLogStat = {}
    cardinality = None
    bbdukVersion = None
    bbmapVersion = None

    # Test for the very path that is opened: the existence check used to
    # look in the current working directory while the open used outDir.
    filterLogFile = os.path.join(outDir, "filter.log")
    if os.path.isfile(filterLogFile):
        filterLogStat, bbdukVersion, bbmapVersion = _parse_filter_log(
            filterLogFile)

    refStatsFile = os.path.join(outDir, "refStats.txt")
    if os.path.isfile(refStatsFile):
        refStats = _parse_ref_stats(refStatsFile)
        log.debug("refStats.txt: %s", str(refStats))

    ###########################################################
    log_and_print("Write to stats file %s" % STATS_LIST_FILE_NAME)
    ###########################################################
    if os.path.isfile(STATS_LIST_FILE_NAME):
        os.remove(STATS_LIST_FILE_NAME)

    # Seed the stats file with the data lines of the BBTools stats file.
    with open(BB_STATS_LIST_FILE_NAME) as bbfh:
        with open(STATS_LIST_FILE_NAME, 'a') as fh:
            for line in bbfh:
                if not line.startswith("#") and line.strip():
                    fh.write(line)

    stats = get_dict_obj(STATS_LIST_FILE_NAME)
    with open(STATS_LIST_FILE_NAME, 'a') as fh:
        for key in readCounts:
            if key not in stats:
                write_stats(fh, key, readCounts[key], log)

        for key in refStats:
            for k in refStats[key]:
                write_stats(fh, key + '_' + k, refStats[key][k], log)

        write_stats(fh, "cardinality", cardinality, log)

        ## Write the values parsed from filter.log
        for key in filterLogStat:
            for k in filterLogStat[key]:
                write_stats(fh, key + '_' + k, filterLogStat[key][k], log)

        bbversionCmd = os.path.join(BBDIR, 'bbversion.sh')
        stdOut, _, _exitCode = run_sh_command(bbversionCmd, True, log)
        assert stdOut is not None
        bbtoolsVersion = stdOut.strip()

        ## 05112017 Now bbtools version = bbmap version
        write_stats(fh, "filter_tool", "bbtools " + bbtoolsVersion, log)
        write_stats(fh, "filter", VERSION, log)

        ## Version recording: fall back to the bbtools version when the
        ## log carried no explicit BBDuk / BBMap version line
        if bbdukVersion is None:
            bbdukVersion = bbtoolsVersion
        if bbmapVersion is None:
            bbmapVersion = bbtoolsVersion
        write_stats(fh, "bbduk_version", bbdukVersion, log)
        write_stats(fh, "bbmap_version", bbmapVersion, log)

    checkpoint(POST_END, status)
    status = POST_END

    return filteredFastq, status