コード例 #1
0
ファイル: rqc_utility.py プロジェクト: nickp60/EToKi
def submit_qsub(qsub_opt, qsub_cmd, qsub_name, output_path, log):
    """Submit a command to the cluster with ``qsub`` and record the job id.

    The job's log file is placed in ``<output_path>/qsub`` (created when
    missing).  On a successful submission the job id is appended to
    ``<output_path>/qsub_list.txt`` as ``<job_id>,submitted``.

    Args:
        qsub_opt: extra option string inserted verbatim into the qsub command.
        qsub_cmd: the command to run on the cluster (wrapped in single quotes).
        qsub_name: job name; also used to name the job's log file.
        output_path: directory that receives the qsub log folder and job list.
        log: logger used for command logging and status messages.

    Returns:
        The integer job id, or 0 when the submission failed or the job id
        could not be read from the qsub output.
    """
    job_id = 0
    job_share = 150
    cluster_project = "gentech-rqc.p"

    qsub_path = os.path.join(output_path, "qsub")
    if not os.path.isdir(qsub_path):
        os.makedirs(qsub_path)

    output_log = os.path.join(qsub_path, "qsub-%s.log" % qsub_name)

    my_qsub = "qsub -b y -j y -m n -w e -terse -N %s -P %s -o %s -js %s %s '%s'" % (
        qsub_name, cluster_project, output_log, job_share, qsub_opt, qsub_cmd)

    cmd = "module load uge;%s" % my_qsub
    stdOut, stdErr, exitCode = run_sh_command(cmd, True, log)
    post_mortem_cmd(cmd, exitCode, stdOut, stdErr, log)

    if exitCode == 0:
        # With -terse the output is expected to be just the job id; anything
        # else is reported as a failed submission instead of raising here.
        try:
            job_id = int(stdOut.strip())
        except (AttributeError, ValueError):
            log.error("- could not parse a job id from qsub output: %r", stdOut)

    log.info("- cluster job id: %s", job_id)

    if job_id > 0:
        # append to qsub_list.txt; the context manager closes the handle even
        # when the write fails
        qsub_log = os.path.join(output_path, "qsub_list.txt")
        with open(qsub_log, "a") as fh:
            fh.write("%s,%s\n" % (job_id, "submitted"))

    return job_id
コード例 #2
0
def read_qc(odir, fastq, status):
    """Run readqc.py on a fastq file unless the QC step has already finished.

    Args:
        odir: pipeline output directory; QC results go to ``odir/READQC_DIR``.
        fastq: path of the fastq file to check.
        status: current pipeline step name (a key of ``STEP_ORDER``).

    Returns:
        A ``(ok, qcdir, status)`` tuple on every path.  ``ok`` is False when
        the fastq file is missing or readqc.py exits non-zero; ``status`` is
        advanced to ``QC_END`` only after a successful run.
    """
    log_and_print(
        "\n\n%s - RUN QC <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    qcdir = os.path.join(odir, READQC_DIR)

    if not os.path.isfile(fastq):
        # Same 3-tuple shape as the other return paths, so a caller that
        # unpacks three values does not fail on a missing input file.
        return False, qcdir, status

    if STEP_ORDER[status] < STEP_ORDER[QC_END]:
        checkpoint(QC_START, status)

        qcexe = os.path.join(os.path.dirname(__file__), 'readqc.py')
        cmd = '%s -o %s -f %s --skip-blast' % (qcexe, qcdir, fastq)
        stdOut, stdErr, exitCode = run_sh_command(cmd, True)
        if exitCode != 0:
            print('ERROR : %s' % stdErr)
            return False, qcdir, status

        checkpoint(QC_END, status)
        status = QC_END
    else:
        log_and_print("No need to do qc step.")

    return True, qcdir, status
コード例 #3
0
ファイル: rqc_utility.py プロジェクト: nickp60/EToKi
def localize_file(fileNameFullPath, log):
    """Copy a file to the local scratch area with rsync.

    Waits for the source file to become visible (up to four checks, sleeping
    60, 120, 180 and 240 seconds after each failed check) before giving up.

    Args:
        fileNameFullPath: full path of the file to localize.
        log: logger passed to the shell helpers and used for status messages.

    Returns:
        The path of the localized copy under /scratch/rqc/localized-file, or
        None when the source file never shows up or rsync fails.
    """
    attempt = 0
    while True:
        attempt += 1

        if os.path.isfile(fileNameFullPath):
            break  # the file system can see the file

        pause = 60 * attempt
        log.error(
            "Filename doesn't exist on the system: %s, sleeping for %s seconds.",
            fileNameFullPath, pause)
        time.sleep(pause)

        if attempt > 3:
            break  # out of retries; the check below decides the outcome

    if not os.path.isfile(fileNameFullPath):
        return None

    destinationPath = "/scratch/rqc/localized-file"
    if not os.path.isdir(destinationPath):
        _, _, exitCode = run_sh_command(
            "mkdir -p /scratch/rqc/localized-file && chmod 777 /scratch/rqc/localized-file",
            True, log, True)
        assert exitCode == 0

    fileName = safe_basename(fileNameFullPath, log)[0]
    target = os.path.join(destinationPath, fileName)
    localizeCmd = "rsync -av --omit-dir-times --no-perms " + fileNameFullPath + " " + target

    _, _, exitCode = run_sh_command(localizeCmd, True, log, True)

    if exitCode != RQCExitCodes.JGI_SUCCESS:
        log.error("File localization failed for %s" % (fileName))
        return None

    log.info("File localization completed for %s" % (fileName))
    return target
コード例 #4
0
ファイル: rqc_utility.py プロジェクト: nickp60/EToKi
def watch_cluster(output_path, log, sleep_time=300):
    """Poll the cluster until every job in ``qsub_list.txt`` has finished.

    Each heartbeat reads ``<output_path>/qsub_list.txt`` (lines of
    ``job_id,status``), asks the job-status tool about every job that is not
    yet ``complete``/``fail``, and rewrites the file with the new statuses.
    Entries already marked ``complete`` or ``fail`` are dropped from the
    rewritten file.  Returns immediately when the list file does not exist.

    Args:
        output_path: directory containing ``qsub_list.txt``.
        log: logger used for progress messages and command logging.
        sleep_time: seconds to wait between heartbeats.

    Returns:
        None.  Polling stops when all jobs counted in a heartbeat are complete
        or in error, or after 180 hours worth of heartbeats.
    """
    log.info("watch_cluster")

    is_job_complete = "/usr/common/usg/bin/isjobcomplete.new"  # + job_id = nice way to ask if job is done
    qsub_log = os.path.join(output_path, "qsub_list.txt")

    # total number of heartbeats before we give up, 180 hours worth of run time
    hb_max = (180 * 3600) / sleep_time

    if not os.path.isfile(qsub_log):
        return

    hb_cnt = 0  # heartbeat count
    while True:
        hb_cnt += 1
        log.info("- heartbeat: %s", hb_cnt)

        with open(qsub_log, "r") as fh:
            qsub_list = [line.strip() for line in fh]

        qsub_cnt = len(qsub_list)
        qsub_complete = 0
        qsub_err = 0

        with open(qsub_log, "w") as fh:
            for qsub in qsub_list:
                job_id, status = qsub.split(",")

                if status in ("complete", "fail"):
                    continue  # finished earlier: not re-checked, not written back

                new_status = status
                cmd = "%s %s" % (is_job_complete, job_id)
                stdOut, stdErr, exitCode = run_sh_command(cmd, True, log)

                # The original compared the bound method ``stdOut.strip`` to a
                # string, so the "running" state could never be detected.
                answer = stdOut.strip()

                if answer == "%s queued/running" % job_id:
                    new_status = "running"
                elif answer == "%s not queued/running" % job_id:
                    new_status = "complete"  # might have failed
                    qsub_complete += 1
                elif answer == "%s queued/running/error" % job_id:
                    new_status = "error"
                    qsub_err += 1
                elif answer == "not queued/running":
                    new_status = "complete"

                fh.write("%s,%s\n" % (job_id, new_status))
                log.info("- job_id: %s, status: %s", job_id, new_status)

        qsub_running = qsub_cnt - (qsub_complete + qsub_err)
        log.info("- job count: %s, running: %s, err: %s", qsub_cnt,
                 qsub_running, qsub_err)

        if qsub_cnt == (qsub_complete + qsub_err) or hb_cnt >= hb_max:
            break

        time.sleep(sleep_time)
コード例 #5
0
ファイル: rqc_utility.py プロジェクト: nickp60/EToKi
def localize_dir2(db, log, bbuffer=None):
    """Locate a BLAST database locally, rsync-ing it to scratch when needed.

    Three kinds of ``db`` paths are distinguished by suffix: fasta references
    (``.fa`` / ``.fasta``), the ``nt_bbdedupe_bbmasked_formatted`` database,
    and everything else.  For the last two, a copy already present under the
    NERSC database directory is returned directly without any rsync.

    Args:
        db: path of the database to localize.
        log: logger passed to the shell helpers and used for messages.
        bbuffer: optional burst-buffer directory to use as the rsync target
            instead of /scratch/rqc.

    Returns:
        A ``(path, exit_code)`` tuple:
        * ``(blastDb, RQCExitCodes.JGI_SUCCESS)`` on success;
        * ``(None, RQCExitCodes.JGI_FAILURE)`` when ``bbuffer`` is given but is
          not a directory;
        * ``(<rsync stdout>, RQCExitCodes.JGI_FAILURE)`` when rsync fails.
    """
    # Both helpers return an exit code; the first is overwritten by the second
    # and neither is checked here.
    safeBaseName, exitCode = safe_basename(db, log)
    safeDirName, exitCode = safe_dirname(db, log)
    targetScratchDir = None

    nerscDb = "/scratch/blastdb/global/dna/shared/rqc/ref_databases/ncbi/CURRENT"
    # nerscDb = "/" ## temporarily do not use NERSC db
    ## nerscDbPersistentBBuffer = "/var/opt/cray/dws/mounts/batch/NCBI_DB_striped_scratch/ncbi"

    ## check if persistent burst buffer is ready
    # NOTE(review): in this branch targetScratchDir stays None, so the
    # os.path.join(targetScratchDir, ...) calls further down would fail if the
    # database is not found under nerscDb (or db is a fasta path) -- confirm
    # whether that path is reachable in practice.
    if bbuffer is None and "DW_PERSISTENT_STRIPED_NCBI_DB" in os.environ and os.environ[
            'DW_PERSISTENT_STRIPED_NCBI_DB'] is not None:
        nerscDb = os.path.join(os.environ['DW_PERSISTENT_STRIPED_NCBI_DB'],
                               "ncbi")

    else:
        targetScratchDir = "/scratch/rqc"

        if bbuffer is not None:
            if not os.path.isdir(bbuffer):
                log.error("Burst Buffer does not initiated: %s", bbuffer)
                return None, RQCExitCodes.JGI_FAILURE
            else:
                targetScratchDir = bbuffer
                log.info("Localization will use Burst Buffer location at %s",
                         targetScratchDir)

        elif not os.path.isdir(targetScratchDir):
            # Create the default scratch directory; note this is a plain
            # "mkdir" (no -p), and a non-zero exit aborts via the assert.
            _, _, exitCode = run_sh_command(
                "mkdir %s && chmod 777 %s" %
                (targetScratchDir, targetScratchDir), True, log, True)
            assert exitCode == 0

    # rsync arguments, filled in by whichever branch below applies
    rsyncOption = ""
    src = db
    dest = ""

    ## Case 1: fasta references, e.g.
    # GREEN_GENES = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/green_genes16s.insa_gg16S.fasta"
    # LSU_REF = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/LSURef_115_tax_silva.fasta"
    # SSU_REF = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/SSURef_NR99_115_tax_silva.fasta"
    # LSSU_REF = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/LSSURef_115_tax_silva.fasta"
    # CONTAMINANTS = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/JGIContaminants.fa"
    # COLLAB16S = "/global/dna/shared/rqc/ref_databases/misc/CURRENT/collab16s.fa"
    if db.endswith(".fa") or db.endswith(".fasta"):
        rsyncOption = "--include '*.n??' --exclude '*.fa' --exclude '*.fasta' --exclude '%s' --exclude '*.log'" % (
            safeBaseName)
        # Only the "<db>.n??" index files are used as the rsync source.
        src = db + ".n??"
        dest = os.path.join(targetScratchDir, safeBaseName)
        blastDb = dest
        if os.path.isfile(
                dest):  ## a plain file with the destination name already exists: remove it first
            rmCmd = "rm -rf %s" % (dest)
            # The result of the removal is not checked.
            run_sh_command(rmCmd, True, log, True)

    ## Case 2: the masked/deduplicated nt database, e.g.
    # NT_maskedYindexedN_BB = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/nt/bbtools_dedupe_mask/nt_bbdedupe_bbmasked_formatted"
    elif db.endswith("nt_bbdedupe_bbmasked_formatted"):
        # Use the copy under nerscDb directly when it exists (no rsync).
        if os.path.isdir(os.path.join(nerscDb, "nt/bbtools_dedupe_mask")):
            blastDb = os.path.join(nerscDb, "nt/bbtools_dedupe_mask")
            log.info("NERSC NCBI Database found: %s" % (blastDb))
            return blastDb, RQCExitCodes.JGI_SUCCESS

        rsyncOption = "--include '*.n??' --exclude '*.fna' --exclude 'nt_bbdedupe_bbmasked_formatted' --exclude '*.log'"
        # Trailing slash: rsync the contents of the database's directory.
        src = safeDirName + '/'
        dest = os.path.join(targetScratchDir, safeBaseName)
        blastDb = dest

    ## Case 3: every other database, e.g.
    # NR                   = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/nr/nr"
    # REFSEQ_ARCHAEA       = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.archaea/refseq.archaea"
    # REFSEQ_BACTERIA      = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.bacteria/refseq.bacteria"
    # REFSEQ_FUNGI         = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.fungi/refseq.fungi"
    # REFSEQ_MITOCHONDRION = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.mitochondrion/refseq.mitochondrion"
    # REFSEQ_PLANT         = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.plant/refseq.plant"
    # REFSEQ_PLASMID       = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.plasmid/refseq.plasmid"
    # REFSEQ_PLASTID       = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.plastid/refseq.plastid"
    # REFSEQ_VIRAL         = "/global/dna/shared/rqc/ref_databases/ncbi/CURRENT/refseq.viral/refseq.viral"
    else:
        # Use the copy under nerscDb directly when it exists (no rsync).
        if os.path.isdir(os.path.join(nerscDb, safeBaseName)):
            blastDb = os.path.join(nerscDb, safeBaseName)
            log.info("NERSC NCBI Database found: %s" % (blastDb))
            return blastDb, RQCExitCodes.JGI_SUCCESS

        rsyncOption = "--include '*.n??' --exclude '*.fna' --exclude '%s' --exclude '*.log'" % (
            safeBaseName)
        src = safeDirName + '/'
        dest = os.path.join(targetScratchDir, safeBaseName)
        # NOTE(review): os.path.join(dest, dest) equals dest whenever dest is
        # an absolute path; with a relative bbuffer it would repeat the path.
        # Possibly os.path.join(dest, safeBaseName) was intended -- confirm.
        blastDb = os.path.join(dest, dest)

    rsyncCmd = "rsync -av --omit-dir-times --no-perms %s %s %s" % (rsyncOption,
                                                                   src, dest)
    # The first value returned by run_sh_command (rsync's stdout) is what gets
    # returned as the "path" on failure.
    dbDirNameLocalized, _, exitCode = run_sh_command(rsyncCmd, True, log, True)

    if exitCode != 0:
        log.error("rsync failed. Cannot localize " + str(db))
        return dbDirNameLocalized, RQCExitCodes.JGI_FAILURE

    else:
        # Open up permissions on the localized copy; the exit code of chmod is
        # captured but not checked.
        cmd = "chmod -f -R 777 %s" % (dest)
        _, _, exitCode = run_sh_command(cmd, True, log)

    return blastDb, RQCExitCodes.JGI_SUCCESS
コード例 #6
0
ファイル: readqc_report.py プロジェクト: nickp60/EToKi
def read_megablast_hits(db, log):
    """Record megablast hit counts and result files for one database.

    Counts the non-comment lines of the ``.parsed``, ``.parsed.tophit``,
    ``.parsed.taxlist`` and ``.parsed.top100hit`` files found under
    ``<output_path>/megablast`` and appends them to the stats file, then
    registers each of those files in the files file.

    Args:
        db: database name used in the megablast output file names.
        log: logger passed to the shell/stat helpers and used for messages.

    Returns:
        RQCExitCodes.JGI_FAILURE when one of the expected files cannot be
        listed, otherwise RQCExitCodes.JGI_SUCCESS (including the case where
        the initial count command exits non-zero, reported as "no blast hits").
    """
    megablastPath = os.path.join(RQCReadQcConfig.CFG["output_path"], "megablast")
    statsFile = RQCReadQcConfig.CFG["stats_file"]
    filesFile = RQCReadQcConfig.CFG["files_file"]

    def as_count(wcOutput):
        # "wc -l" output is trusted only when it is a single all-digit field.
        fields = wcOutput.split()
        if len(fields) == 1 and fields[0].isdigit():
            return int(fields[0])
        return 0

    def count_lines(pattern):
        # Returns (stdout, exit code) of the non-comment line count command.
        out, _, rc = run_sh_command(
            "grep -v '^#' %s 2>/dev/null | wc -l " % (pattern), True, log)
        return out, rc

    def record_count(pattern, statName):
        out, _ = count_lines(pattern)
        append_rqc_stats(statsFile, statName + " " + db, as_count(out), log)

    def register_file(pattern, fileKey, failureMsg):
        # Lists the file matching `pattern`; False means nothing was found.
        listing, _, _ = run_sh_command("ls %s" % (pattern), True, log)
        listing = listing.strip()
        if not listing:
            log.error(failureMsg % (db))
            return False
        append_rqc_file(filesFile, fileKey + " " + db,
                        os.path.join(megablastPath, listing), log)
        return True

    ##
    ## Process blast output files
    ##
    parsedFile = os.path.join(megablastPath, "megablast.*.%s*.parsed" % (db))
    matchings, exitCode = count_lines(parsedFile)

    if exitCode != 0:
        log.info("- No blast hits for %s." % (db))
        return RQCExitCodes.JGI_SUCCESS

    append_rqc_stats(statsFile,
                     ReadqcStats.ILLUMINA_READ_MATCHING_HITS + " " + db,
                     as_count(matchings), log)

    ##
    ## add .parsed file (checked for emptiness before stripping)
    ##
    parsedFileFound, _, exitCode = run_sh_command("ls %s" % (parsedFile),
                                                  True, log)
    if not parsedFileFound:
        log.error("- Failed to add megablast parsed file of %s." % (db))
        return RQCExitCodes.JGI_FAILURE

    append_rqc_file(filesFile,
                    ReadqcStats.ILLUMINA_READ_PARSED_FILE + " " + db,
                    os.path.join(megablastPath, parsedFileFound.strip()), log)

    tophitFile = os.path.join(megablastPath,
                              "megablast.*.%s*.parsed.tophit" % (db))
    taxlistFile = os.path.join(megablastPath,
                               "megablast.*.%s*.parsed.taxlist" % (db))
    top100hitFile = os.path.join(megablastPath,
                                 "megablast.*.%s*.parsed.top100hit" % (db))

    ##
    ## wc the top hits, the taxonomic species and the top 100 hits
    ##
    record_count(tophitFile, ReadqcStats.ILLUMINA_READ_TOP_HITS)
    record_count(taxlistFile, ReadqcStats.ILLUMINA_READ_TAX_SPECIES)
    record_count(top100hitFile, ReadqcStats.ILLUMINA_READ_TOP_100HITS)

    ##
    ## Find and add the taxlist, tophit and top100hit files, in that order
    ##
    registrations = (
        (taxlistFile, ReadqcStats.ILLUMINA_READ_TAXLIST_FILE,
         "- Failed to add megablast taxlist file of %s."),
        (tophitFile, ReadqcStats.ILLUMINA_READ_TOPHIT_FILE,
         "- Failed to add megablast tophit file of %s."),
        (top100hitFile, ReadqcStats.ILLUMINA_READ_TOP100HIT_FILE,
         "- Failed to add megablast top100hit file of %s."),
    )
    for pattern, fileKey, failureMsg in registrations:
        if not register_file(pattern, fileKey, failureMsg):
            return RQCExitCodes.JGI_FAILURE

    return RQCExitCodes.JGI_SUCCESS
コード例 #7
0
def _removed_pct(before, after):
    """Return the percentage of `before` missing from `after` (0.0 when `before` is 0)."""
    if before == 0:
        return 0.0
    return 100.0 * ((before - after) / float(before))


def _parse_filter_log(filterLogPath):
    """Parse a filter.log file into ``(filterLogStat, bbdukVersion, bbmapVersion)``."""
    # Raw string avoids the invalid "\d" escape of the former plain literal.
    # The pattern itself is unchanged so extracted tokens stay identical.
    # NOTE(review): the "." is unescaped and matches any character, so an
    # integer token can carry the separator that follows it.
    numbers = re.compile(r"(\d+.\d*)")

    # Lines reported as reads / % reads / bases / % bases, keyed by prefix.
    removalKeys = (
        ("FTrimmed:", "ftrimmed"),
        ("KTrimmed:", "ktrimmed"),
        ("Trimmed by overlap:", "trimmedbyoverlap"),
        ("QTrimmed:", "qtrimmed"),
        ("Short Read Discards:", "shortreaddiscards"),
        ("Low quality discards:", "lowqualitydiscards"),
        ## BBDuk 36.12 06272016
        ("Adapter Sequence Removed:", "adaptersequenceremoved"),
        ("Synthetic Contam Sequence Removed:", "syntheticcontamsequenceremoved"),
        ## 08112016
        ("Short Synthetic Contam Sequence Removed:",
         "shortsyntheticcontamsequenceremoved"),
        ("Ribosomal Sequence Removed:", "ribosomalsequenceremoved"),
        ## BBMap 36.12 06272016
        ("Human Sequence Removed:", "humansequenceremoved"),
        ## RQC-862, RQC-880
        ("Microbial Sequence Removed:", "microbialremoved"),
    )

    def four_fields(toks):
        assert len(toks) == 4
        return {
            "numreads": toks[0],
            "percreads": toks[1],
            "numbases": toks[2],
            "percbases": toks[3]
        }

    filterLogStat = {}
    bbdukVersion = None
    bbmapVersion = None
    isContamNumChecked = False  ## Contamination will be done twice for removeribo or for MTF
    isKtrimmedTotalRemovedNumChecked = False  ## for parsing "Total Removed" after ktrimming

    with open(filterLogPath, "r") as FLFH:
        for line in FLFH:
            if line.startswith("Input:"):
                toks = numbers.findall(line.rstrip())
                assert len(toks) == 2
                # first occurrence = adapter trimming, second = contam trimming
                for key in ("adaptertriminput", "contamtriminput"):
                    if key not in filterLogStat:
                        filterLogStat[key] = {
                            "numreads": toks[0],
                            "numbases": toks[1]
                        }
                        break

            ## RQCSUPPORT-1987
            elif line.startswith(
                    "Total Removed:") and isKtrimmedTotalRemovedNumChecked:
                filterLogStat["ktrimmed_total_removed"] = four_fields(
                    numbers.findall(line.rstrip()))
                isKtrimmedTotalRemovedNumChecked = False

            elif line.startswith("Result:"):
                stat = four_fields(numbers.findall(line.rstrip()))
                for key in ("adaptertrimresult", "contamtrimresult"):
                    if key not in filterLogStat:
                        filterLogStat[key] = stat
                        break

            elif line.startswith("Unique 31-mers:"):
                toks = numbers.findall(line.rstrip())
                assert len(toks) == 2 or len(toks) == 1
                num = toks[1] if len(toks) == 2 else "0"
                if 'adaptertrimunique31mers' not in filterLogStat:
                    filterLogStat["adaptertrimunique31mers"] = {"num": num}
                else:
                    filterLogStat["contamtrimunique31mers"] = {"num": num}

            elif not isContamNumChecked and line.startswith("Contaminants:"):
                filterLogStat["contaminants"] = four_fields(
                    numbers.findall(line.rstrip()))
                isContamNumChecked = True

            elif line.startswith("BBDuk version"):
                toks = numbers.findall(line.rstrip())
                assert len(toks) == 1
                bbdukVersion = toks[0]

            elif line.startswith("BBMap version"):
                toks = numbers.findall(line.rstrip())
                assert len(toks) == 1
                bbmapVersion = toks[0]

            else:
                for prefix, key in removalKeys:
                    if line.startswith(prefix):
                        filterLogStat[key] = four_fields(
                            numbers.findall(line.rstrip()))
                        if key == "ktrimmed":
                            # the next "Total Removed:" line belongs to ktrimming
                            isKtrimmedTotalRemovedNumChecked = True
                        break

    return filterLogStat, bbdukVersion, bbmapVersion


def _parse_ref_stats(refStatsFile):
    """Parse refStats.txt into per-organism mapping statistics.

    refStats.txt format:

    name %unambiguousReads   unambiguousMB   %ambiguousReads ambiguousMB unambiguousReads    ambiguousReads
    human_masked 85.24693    498.92052   0.09378 0.55290 3350692 3686
    mouse_masked 0.03765 0.21670 0.10802 0.63690 1480    4246
    cat_masked   0.01862 0.09568 0.02514 0.14820 732 988
    dog_masked   0.00697 0.03815 0.01384 0.08160 274 544
    """
    organisms = ("human", "cat", "dog", "mouse")
    refStats = {}

    with open(refStatsFile) as RFH:
        ## Need to report 0 if nothing matched
        for name in organisms:
            refStats[name] = {
                "unambiguousReadsPerc": "0",
                "unambiguousMB": "0",
                "ambiguousReadsPerc": "0",
                "ambiguousMB": "0",
                "unambiguousReads": "0",
                "ambiguousReads": "0",
                "totalPerc": "0"
            }

        for line in RFH:
            # skip blank lines (they previously tripped the column assert) and comments
            if not line.strip() or line.startswith("#"):
                continue

            toks = line.rstrip().split()
            assert len(toks) == 7

            ## the number and percent of reads that map unambiguously or ambiguously to human, cat, dog.
            ## take the sum of the two numbers (ambiguous plus unambiguous) to use as the final percentage.
            for name in organisms:
                if line.startswith(name):
                    refStats[name] = {
                        "unambiguousReadsPerc": toks[1],
                        "unambiguousMB": toks[2],
                        "ambiguousReadsPerc": toks[3],
                        "ambiguousMB": toks[4],
                        "unambiguousReads": toks[5],
                        "ambiguousReads": toks[6],
                        "totalPerc": float(toks[3]) + float(toks[1])
                    }

    return refStats


def post_process(fastq, outDir, filteredFastq, status, log):
    """Collect filtering statistics and write them to STATS_LIST_FILE_NAME.

    Reads the input/output read and base counts from BB_STATS_LIST_FILE_NAME,
    parses ``filter.log`` and ``refStats.txt`` from ``outDir`` when present,
    rewrites STATS_LIST_FILE_NAME from the non-comment, non-blank lines of
    BB_STATS_LIST_FILE_NAME, and appends the derived statistics plus tool
    version information.  Nothing is done when ``status`` is already at or
    past POST_END.

    Args:
        fastq: input fastq path (not used by this step).
        outDir: directory containing ``filter.log`` and ``refStats.txt``.
        filteredFastq: filtered fastq path, returned unchanged.
        status: current pipeline step name (a key of ``STEP_ORDER``).
        log: logger passed to the stat/shell helpers.

    Returns:
        ``(filteredFastq, status)`` where ``status`` is POST_END after the
        step has run, or the incoming status when it was skipped.
    """
    log_and_print(
        "\n\n%s - RUN POST PROCESS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    if STEP_ORDER[status] >= STEP_ORDER[POST_END]:
        log_and_print('No need to do post processing.')
        return filteredFastq, status

    checkpoint(POST_START, status)

    stats = get_dict_obj(BB_STATS_LIST_FILE_NAME)

    def read_count(name):
        # a fresh spec dict per call, as before
        return pipeline_val(name, {'type': 'int', 'vtype': 'numeric'}, stats)

    rawCnt = read_count('inputReads')
    rawBaseCnt = read_count('inputBases')
    newCnt = read_count('outputReads')
    newBaseCnt = read_count('outputBases')

    # A zero input count now yields 0.000 instead of ZeroDivisionError.
    readCounts = {
        'readRmPct': '%.3f' % _removed_pct(rawCnt, newCnt),
        'baseRmPct': '%.3f' % _removed_pct(rawBaseCnt, newBaseCnt),
    }

    refStats = {}
    filterLogStat = {}

    cardinality = None
    bbdukVersion = None
    bbmapVersion = None

    # Test the same path that is opened; the former check looked for
    # "filter.log" in the current working directory.
    filterLogFile = os.path.join(outDir, "filter.log")
    if os.path.isfile(filterLogFile):
        filterLogStat, bbdukVersion, bbmapVersion = _parse_filter_log(
            filterLogFile)

    refStatsFile = os.path.join(outDir, "refStats.txt")
    if os.path.isfile(refStatsFile):
        refStats = _parse_ref_stats(refStatsFile)
        log.debug("refStats.txt: %s", str(refStats))

    ###########################################################
    log_and_print("Write to stats file %s" % STATS_LIST_FILE_NAME)
    ###########################################################
    if os.path.isfile(STATS_LIST_FILE_NAME):
        os.remove(STATS_LIST_FILE_NAME)

    with open(BB_STATS_LIST_FILE_NAME) as bbfh:
        with open(STATS_LIST_FILE_NAME, 'a') as fh:
            for line in bbfh:
                if not line.startswith("#") and line.strip():
                    fh.write(line)

    stats = get_dict_obj(STATS_LIST_FILE_NAME)
    with open(STATS_LIST_FILE_NAME, 'a') as fh:
        # only add the derived percentages when they are not already present
        for key in readCounts:
            if key not in stats:
                write_stats(fh, key, readCounts[key], log)

        for key in refStats:
            for k in refStats[key]:
                write_stats(fh, key + '_' + k, refStats[key][k], log)

        write_stats(fh, "cardinality", cardinality, log)

        ## Write the filter.log statistics
        for key in filterLogStat:
            for k in filterLogStat[key]:
                write_stats(fh, key + '_' + k, filterLogStat[key][k], log)

        bbversionCmd = os.path.join(BBDIR, 'bbversion.sh')
        cmd = "%s" % (bbversionCmd)
        stdOut, _, exitCode = run_sh_command(cmd, True, log)
        assert stdOut is not None
        bbtoolsVersion = stdOut.strip()

        ## 05112017 Now bbtools version = bbmap version
        write_stats(fh, "filter_tool", "bbtools " + bbtoolsVersion, log)
        write_stats(fh, "filter", VERSION, log)

        ## Version recording: fall back to the bbtools version when the log
        ## did not report a specific BBDuk/BBMap version
        if bbdukVersion is None:
            bbdukVersion = bbtoolsVersion
        if bbmapVersion is None:
            bbmapVersion = bbtoolsVersion
        write_stats(fh, "bbduk_version", bbdukVersion, log)
        write_stats(fh, "bbmap_version", bbmapVersion, log)

    checkpoint(POST_END, status)
    status = POST_END

    return filteredFastq, status
コード例 #8
0
def run_rqcfilter(infastq, outDir, prodType, status, enableRmoveMicrobes,
                  enableAggressive, disableRmoveMicrobes, disableClumpify,
                  taxList, rdb, log):
    """Run rqcfilter.sh on a fastq file and locate the filtered output.

    The filter options are read from the first line of the ``.config`` file
    registered for ``prodType`` in FILTER_METHODS_TXT and then adjusted by the
    boolean switches below. The resulting command is written to
    ``<outDir>/filter.sh`` and executed, with its output redirected to
    ``<outDir>/filter.log``.

    Parameters:
        infastq: input fastq path, passed to rqcfilter.sh as ``in=``.
        outDir: output directory (created if needed), passed as ``path=``.
        prodType: product type key of FILTER_METHODS_TXT. A trailing "-OLD"
            is stripped and adds "barcodefilter=f chastityfilter=f".
        status: current pipeline step; the filter is only (re)run when it is
            ordered before RQCFILTER_END in STEP_ORDER.
        enableRmoveMicrobes: force ``removemicrobes=t``.
        enableAggressive: add ``aggressive=t microbebuild=3``.
        disableRmoveMicrobes: force ``removemicrobes=f``.
        disableClumpify: emit ``clumpify=f`` instead of ``clumpify=t``.
        taxList: value for ``taxlist=`` (skipped when empty).
        rdb: value for ``rqcfilterdata=``.
        log: logger used for warnings and errors.

    Returns:
        A two-item list ``[filtered_fastq_path, status]``. The path is None
        when the filter command fails or the filtered file cannot be found.

    Exits the process with code 2 when ``prodType`` is not supported.
    """
    log_and_print(
        "\n\n%s - RUN RQCFILTER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n"
        % (color['pink'], color['']))

    make_dir_p(outDir)
    opt = None

    extraOptions = ""
    optionFile = ""

    if prodType.endswith("-OLD"):
        extraOptions = " barcodefilter=f chastityfilter=f"
        prodType = prodType.replace("-OLD", "")

    if prodType in FILTER_METHODS_TXT:
        optionFile = os.path.join(
            SRC_ROOT, "filter_param/",
            FILTER_METHODS_TXT[prodType].replace(".txt", ".config"))
        log_and_print("Filter option file: %s" % optionFile)
        # Only the first line of the config file carries the options; use a
        # context manager so the handle is closed deterministically.
        with open(optionFile, 'r') as optfh:
            opt = optfh.readline().rstrip()
    else:
        log_and_print("The product type, %s, is not supported yet." % prodType)
        sys.exit(2)

    assert (opt), "Null filter options."

    if prodType in ("METATRANSCRIPTOME", "MTF"):
        # Name the rRNA output after the input; it always ends in .fastq.gz.
        if infastq.endswith(".gz"):
            opt += " outribo=%s " % (os.path.basename(infastq).replace(
                ".fastq", ".rRNA.fastq"))
        else:
            opt += " outribo=%s " % (os.path.basename(infastq).replace(
                ".fastq", ".rRNA.fastq.gz"))

    if enableRmoveMicrobes:
        if opt.find("removemicrobes=f") != -1:
            opt = opt.replace("removemicrobes=f", "removemicrobes=t")
        # Appended unconditionally, so the option string always ends up
        # containing removemicrobes=t at this point.
        opt += " removemicrobes=t "

    if disableRmoveMicrobes:
        if opt.find("removemicrobes=t") != -1:
            opt = opt.replace("removemicrobes=t", "removemicrobes=f")
        else:
            opt += " removemicrobes=f "

    if enableAggressive:
        opt += " aggressive=t microbebuild=3 "

    if taxList:
        opt += " taxlist=%s " % (taxList)

    ## Temp set clumpify=t for all prod types (RQC-890)
    if not disableClumpify:
        opt += " clumpify=t "
    else:
        opt += " clumpify=f "

    opt += " tmpdir=null "
    opt += extraOptions

    cmd = os.path.join(BBDIR, "rqcfilter.sh")
    filterLogFile = os.path.join(outDir, "filter.log")
    cmdStr = "%s in=%s path=%s %s usejni=f rqcfilterdata=%s > %s 2>&1" % (
        cmd, infastq, outDir, opt, rdb, filterLogFile)

    # Default (failure) return value: no filtered file, unchanged status.
    rtn = [None, status]
    outFastqFile = None

    shFileName = "%s/filter.sh" % outDir

    def find_filtered_fastq(adir):
        """Locate the filtered fastq in adir and rename it to *.filtered.fastq.gz.

        Returns the (possibly renamed) path, or None when neither
        <prefix>.<code>.fastq.gz nor <prefix>.<code>.lmp.fastq.gz exists.
        Raises AssertionError when adir holds no *.fastq.gz file or no file
        name yields a filter code and prefix.
        """
        searchPatt = os.path.join(adir, "*.fastq.gz")
        outFastqFileList = glob.glob(searchPatt)

        assert len(outFastqFileList
                   ) >= 1, "ERROR: cannot find *.fastq.gz output file."

        # Pre-initialise so that the assert below reports the problem instead
        # of an UnboundLocalError when no file name provides these values.
        filterCode = None
        fileNamePrefix = None

        for f in outFastqFileList:
            f = os.path.basename(f)
            t = f.split(".")

            # Side outputs (frag/singleton/unknown/rRNA) carry no filter code.
            if t[-3] not in ("frag", "singleton", "unknown", "rRNA", "lmp"):
                filterCode = t[-3]
            elif t[-3] == "lmp" and len(t) >= 4:  ## nextera
                filterCode = t[-4]

            if len(t) == 7:  ## ex) 12345.1.1234.ACCCC.anqdpht.fastq.gz
                fileNamePrefix = '.'.join(t[:4])
            elif len(t) == 6:  ## ex) 6176.5.39297.anqrpht.fastq.gz
                fileNamePrefix = '.'.join(t[:3])
            else:
                log.warning("Unexpected filtered file name, %s",
                            outFastqFileList)
                fileNamePrefix = '.'.join(t[:-3])
                log_and_print("Use %s as file prefix." % fileNamePrefix)

        assert filterCode and fileNamePrefix, "ERROR: unexpected filter file name: %s" % (
            outFastqFileList)

        of = os.path.join(adir,
                          '.'.join([fileNamePrefix, filterCode, "fastq.gz"]))
        lof = os.path.join(
            adir, '.'.join([fileNamePrefix, filterCode, "lmp.fastq.gz"]))
        if os.path.isfile(of):
            foundFastq = of
        elif os.path.isfile(lof):
            foundFastq = lof
        else:
            # Nothing to rename; report the miss instead of crashing on
            # os.path.basename(None).
            log.error("Cannot find fastq.gz file.")
            return None

        # rename output file to *.filtered.fastq.gz
        t = os.path.basename(foundFastq).split(".")
        if t[-3] != 'filtered':
            fto = os.path.join(
                adir, '.'.join(['.'.join(t[:-3]), 'filtered', "fastq.gz"]))
            shutil.move(foundFastq, fto)
            foundFastq = fto

        return foundFastq

    def find_filter_number(outFastqFile):
        """Return check_fastq_format(outFastqFile) / 4, reporting negatives.

        NOTE(review): presumably check_fastq_format returns a line count
        (4 lines per read) and a negative value on format errors - confirm.
        """
        filteredReadNum = fastqUtil.check_fastq_format(outFastqFile) / 4

        if filteredReadNum < 0:
            log_and_print("RUN RQCFILTER - filtered fastq format error: %s." %
                          outFastqFile)
        return filteredReadNum

    if STEP_ORDER[status] < STEP_ORDER[RQCFILTER_END]:

        create_shell(shFileName, (cmdStr, ))

        log_and_print("rqcfilter cmd=[%s]" % cmdStr)
        log_and_print("sh file name=[%s]" % shFileName)

        stdOut, stdErr, exitCode = run_sh_command(
            shFileName, True, log, True)  ## an exit code of 0 is success

        if exitCode != 0:
            log.error("Failed to run : %s, stdout : %s, stderr: %s",
                      shFileName, stdOut, stdErr)
            return rtn

        outFastqFile = find_filtered_fastq(outDir)
        if outFastqFile is None:
            # Already logged by find_filtered_fastq; treat it like a failed
            # run and do not checkpoint the step as finished.
            return rtn

        filteredReadNum = find_filter_number(outFastqFile)
        log_and_print("Read counts after RQCFILTER step = %d" %
                      filteredReadNum)

        log_and_print("RUN RQCFILTER - completed")
        checkpoint(RQCFILTER_END, status)
        status = RQCFILTER_END

        if filteredReadNum == 0:
            # Nothing survived filtering: mark the pipeline complete and
            # record zero read/base counts.
            log.warning("No reads left after filtering")
            checkpoint(PIPE_COMPLETE, status)
            with open(BB_STATS_LIST_FILE_NAME, 'a') as fh:
                write_stats(fh, FILTER_READ_COUNT, 0, log)
                write_stats(fh, FILTER_READ_BASE_COUNT, 0, log)
    else:
        log_and_print(
            "No need to rerun RQCFILTER step, get filtered files and stats ... "
        )
        outFastqFile = find_filtered_fastq(outDir)

    rtn = [outFastqFile, status]
    return rtn