コード例 #1
0
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  #{{{
    """Run the PRODRES workflow for every sequence in *infile* and package the results.

    The function works in three phases:

    1. Read the FASTA file record by record.  Unless ``g_params['isForceRun']``
       is set, look for a cached result keyed by the md5 of
       ``sequence + str(query_para)``; records with a usable cached result are
       copied to ``<outpath>/<jobid>/seq_<index>`` and skipped.
    2. Unless ``g_params['isOnlyGetCache']`` is set, run ``runscript`` once per
       remaining sequence, move its output to ``seq_<index>`` and, on the
       front-end node, store a zipped copy in the cache.
    3. Zip the result folder, write the finish/failed tag file and optionally
       send a notification email.

    Besides its arguments, the function reads several names that are not
    defined here (``gen_errfile``, ``path_cache``, ``FORMAT_DATETIME``,
    ``runscript``, ``path_pfamdatabase``, ``path_pfamscanscript``, ``blastdb``,
    ``finished_date_db``, ``contact_email``) -- presumably module-level
    globals; verify against the rest of the file.

    Args:
        infile: Path of the FASTA file with the query sequences.
        outpath: Job folder.  Tag files, log files and ``query.para.txt`` live
            directly in it; results go to ``<outpath>/<jobid>``.
        tmpdir: Scratch folder.  It is removed at the end unless the error
            file has content.
        email: Address to notify when the job ends; only used when it is
            valid and the code runs on the front-end node.
        jobid: Used as the name of the result folder and of the zip file.
        g_params: Dict of settings.  Keys read here: ``'isForceRun'``,
            ``'isOnlyGetCache'``, ``'FORMAT_DATETIME'`` and ``'base_www_url'``.

    Returns:
        1 if a result folder could not be created or if ``runjob.err`` exists
        with a size above one byte at the end; 0 otherwise.
    """
    all_begin_time = time.time()

    # NOTE(review): rootname, app_logfile, rmsg, tarball and tarball_fullpath
    # are assigned below but not read anywhere else in this function.
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    app_logfile = "%s/app.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    failedtagfile = "%s/runjob.failed" % (outpath)
    query_parafile = "%s/query.para.txt" % (outpath)

    # query_para stays an empty string when the parameter file has no content;
    # the `'key' in query_para` tests further down are then all False.
    query_para = ""
    content = myfunc.ReadFile(query_parafile)
    if content != "":
        query_para = json.loads(content)

    rmsg = ""

    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)

    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)

    # Create the result folder and its scratch counterpart.
    # NOTE(review): os.makedirs() without exist_ok also raises OSError when the
    # folder already exists, so running again on an existing job folder
    # returns 1 here -- confirm that this is intended.
    for folder in [outpath_result, tmp_outpath_result]:
        try:
            os.makedirs(folder)
        except OSError:
            msg = "Failed to create folder %s" % (folder)
            myfunc.WriteFile(msg + "\n", gen_errfile, "a")
            return 1
    # Create (or truncate) the list of finished sequences; failures are
    # deliberately ignored.
    try:
        open(finished_seq_file, 'w').close()
    except:
        pass
# Phase 1: first try to get results from the cache
# ==================================

    maplist = []  # "seq_<i>\tlength\tdescription\tsequence" per record
    maplist_simple = []  # same as maplist but without the sequence column
    toRunDict = {}  # origIndex -> [seq, numTM, description] for records to run
    hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
    if hdl.failure:
        # NOTE(review): isOK is not read again in this function.
        isOK = False
    else:
        webcom.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                    runjob_errfile)
        recordList = hdl.readseq()
        cnt = 0  # running index of the record within infile
        origpath = os.getcwd()
        # hdl.readseq() is called repeatedly until it returns None.
        while recordList != None:
            for rd in recordList:
                isSkip = False
                # temp outpath for the sequence is always seq_0, and I feed
                # only one seq a time to the workflow
                tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                  "seq_%d" % 0)
                outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % cnt)
                subfoldername_this_seq = "seq_%d" % (cnt)
                if os.path.exists(tmp_outpath_this_seq):
                    try:
                        shutil.rmtree(tmp_outpath_this_seq)
                    except OSError:
                        pass

                maplist.append(
                    "%s\t%d\t%s\t%s" %
                    ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                maplist_simple.append(
                    "%s\t%d\t%s" %
                    ("seq_%d" % cnt, len(rd.seq), rd.description))
                if not g_params['isForceRun']:
                    # The cache key covers both the sequence and the query
                    # parameters; entries live under <path_cache>/<first two
                    # hex digits>/<md5>, either as a folder or as a .zip file.
                    md5_key = hashlib.md5(
                        (rd.seq +
                         str(query_para)).encode('utf-8')).hexdigest()
                    subfoldername = md5_key[:2]
                    cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                             md5_key)
                    zipfile_cache = cachedir + ".zip"

                    if os.path.exists(cachedir) or os.path.exists(
                            zipfile_cache):
                        # An unpacked cache folder takes precedence over the
                        # zipped one.
                        if os.path.exists(cachedir):
                            try:
                                shutil.copytree(cachedir, outpath_this_seq)
                            except Exception as e:
                                msg = "Failed to copytree  %s -> %s" % (
                                    cachedir, outpath_this_seq)
                                # NOTE(review): this uses the bare name
                                # FORMAT_DATETIME while other places in this
                                # function use g_params['FORMAT_DATETIME'].
                                date_str = time.strftime(FORMAT_DATETIME)
                                myfunc.WriteFile(
                                    "[%s] %s with errmsg=%s\n" %
                                    (date_str, msg, str(e)), runjob_errfile,
                                    "a")
                        elif os.path.exists(zipfile_cache):
                            # The archive is expected to unpack to a folder
                            # named <md5_key>, which is then renamed to
                            # seq_<cnt>.
                            cmd = [
                                "unzip", zipfile_cache, "-d", outpath_result
                            ]
                            webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)
                            shutil.move("%s/%s" % (outpath_result, md5_key),
                                        outpath_this_seq)

                        # The record counts as finished only when the result
                        # folder is actually in place.
                        if os.path.exists(outpath_this_seq):
                            info_finish = webcom.GetInfoFinish_PRODRES(
                                outpath_this_seq,
                                cnt,
                                len(rd.seq),
                                rd.description,
                                source_result="cached",
                                runtime=0.0)
                            myfunc.WriteFile("\t".join(info_finish) + "\n",
                                             finished_seq_file,
                                             "a",
                                             isFlush=True)
                            isSkip = True

                if not isSkip:
                    # first try to delete the outfolder if exists
                    if os.path.exists(outpath_this_seq):
                        try:
                            shutil.rmtree(outpath_this_seq)
                        except OSError:
                            pass
                    origIndex = cnt
                    numTM = 0
                    toRunDict[origIndex] = [rd.seq, numTM, rd.description
                                            ]  #init value for numTM is 0

                cnt += 1
            recordList = hdl.readseq()
        hdl.close()
    myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)

    # Phase 2: run the workflow for every record without a cached result.
    if not g_params['isOnlyGetCache']:
        # Dump all sequences that still have to be run into one FASTA file,
        # using the original index as the identifier.
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        # Sort by numTM, largest first.  numTM is always 0 in this function,
        # so the stable sort leaves the insertion order unchanged.
        sortedlist = sorted(list(toRunDict.items()),
                            key=lambda x: x[1][1],
                            reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]

        # submit sequences one by one to the workflow according to orders in
        # sortedlist

        for item in sortedlist:
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            subfoldername_this_seq = "seq_%d" % (origIndex)
            outpath_this_seq = "%s/%s" % (outpath_result,
                                          subfoldername_this_seq)
            # The scratch output folder is always seq_0; it is cleared before
            # each run.
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" %
                                              (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" %
                                          (origIndex))
            seqcontent = ">query_%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            # Skip this sequence when its input file could not be written.
            if not os.path.exists(seqfile_this_seq):
                msg = "failed to generate seq index %d" % (origIndex)
                date_str = time.strftime(g_params['FORMAT_DATETIME'])
                myfunc.WriteFile("[%s] %s\n" % (date_str, msg), runjob_errfile,
                                 "a", True)
                continue

            # Base command line; optional flags from query_para are appended
            # below.
            cmd = [
                "python", runscript, "--input", seqfile_this_seq, "--output",
                tmp_outpath_this_seq, "--pfam-dir", path_pfamdatabase,
                "--pfamscan-script", path_pfamscanscript,
                "--fallback-db-fasta", blastdb
            ]

            if 'second_method' in query_para and query_para[
                    'second_method'] != "":
                cmd += ['--second-search', query_para['second_method']]

            # For pfamscan the e-value takes precedence: the bit score is only
            # passed when no e-value is given.
            if 'pfamscan_evalue' in query_para and query_para[
                    'pfamscan_evalue'] != "":
                cmd += ['--pfamscan_e-val', query_para['pfamscan_evalue']]
            elif 'pfamscan_bitscore' in query_para and query_para[
                    'pfamscan_bitscore'] != "":
                cmd += ['--pfamscan_bitscore', query_para['pfamscan_bitscore']]

            # Any value other than False is passed on as 'yes'.
            if 'pfamscan_clanoverlap' in query_para:
                if query_para['pfamscan_clanoverlap'] == False:
                    cmd += ['--pfamscan_clan-overlap', 'no']
                else:
                    cmd += ['--pfamscan_clan-overlap', 'yes']

            if 'jackhmmer_iteration' in query_para and query_para[
                    'jackhmmer_iteration'] != "":
                cmd += [
                    '--jackhmmer_max_iter', query_para['jackhmmer_iteration']
                ]

            if 'jackhmmer_threshold_type' in query_para and query_para[
                    'jackhmmer_threshold_type'] != "":
                cmd += [
                    '--jackhmmer-threshold-type',
                    query_para['jackhmmer_threshold_type']
                ]

            # Same precedence rule as for pfamscan: e-value before bit score.
            if 'jackhmmer_evalue' in query_para and query_para[
                    'jackhmmer_evalue'] != "":
                cmd += ['--jackhmmer_e-val', query_para['jackhmmer_evalue']]
            elif 'jackhmmer_bitscore' in query_para and query_para[
                    'jackhmmer_bitscore'] != "":
                cmd += [
                    '--jackhmmer_bit-score', query_para['jackhmmer_bitscore']
                ]

            if 'psiblast_iteration' in query_para and query_para[
                    'psiblast_iteration'] != "":
                cmd += ['--psiblast_iter', query_para['psiblast_iteration']]
            if 'psiblast_outfmt' in query_para and query_para[
                    'psiblast_outfmt'] != "":
                cmd += ['--psiblast_outfmt', query_para['psiblast_outfmt']]

            # NOTE(review): t_success is not checked; whether the run produced
            # output is decided below by looking at the file system.
            (t_success,
             runtime_in_sec) = webcom.RunCmd(cmd, runjob_logfile,
                                             runjob_errfile, True)

            # Make sure a seq.fa carrying the original description exists in
            # the scratch result folder <tmp seq_0>/query_0.
            # NOTE(review): what happens when that folder is missing depends
            # on myfunc.WriteFile -- confirm.
            aaseqfile = "%s/seq.fa" % (tmp_outpath_this_seq + os.sep +
                                       "query_0")
            if not os.path.exists(aaseqfile):
                seqcontent = ">%s\n%s\n" % (description, seq)
                myfunc.WriteFile(seqcontent, aaseqfile, "w")

            if os.path.exists(tmp_outpath_this_seq):
                # Move <tmp seq_0>/query_0 to the final seq_<origIndex> folder.
                cmd = [
                    "mv", "-f", tmp_outpath_this_seq + os.sep + "query_0",
                    outpath_this_seq
                ]
                isCmdSuccess = False
                (isCmdSuccess,
                 t_runtime) = webcom.RunCmd(cmd, runjob_logfile,
                                            runjob_errfile, True)

                # Clean up intermediate files unless the user asked to keep
                # them (key absent or set to False means "delete").
                if not 'isKeepTempFile' in query_para or query_para[
                        'isKeepTempFile'] == False:
                    try:
                        temp_result_folder = "%s/temp" % (outpath_this_seq)
                        shutil.rmtree(temp_result_folder)
                    except:
                        msg = "Failed to delete the folder %s" % (
                            temp_result_folder)
                        date_str = time.strftime(g_params['FORMAT_DATETIME'])
                        myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                         runjob_errfile, "a", True)

                    flist = [
                        "%s/outputs/%s" % (outpath_this_seq, "Alignment.txt"),
                        "%s/outputs/%s" % (outpath_this_seq, "tableOut.txt"),
                        "%s/outputs/%s" % (outpath_this_seq, "fullOut.txt")
                    ]
                    for f in flist:
                        if os.path.exists(f):
                            try:
                                os.remove(f)
                            except:
                                msg = "Failed to delete the file %s" % (f)
                                date_str = time.strftime(
                                    g_params['FORMAT_DATETIME'])
                                myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                                 runjob_errfile, "a", True)

                # Only a successful move marks the sequence as finished.
                if isCmdSuccess:
                    timefile = "%s/time.txt" % (outpath_this_seq)
                    runtime = webcom.ReadRuntimeFromFile(timefile,
                                                         default_runtime=0.0)
                    info_finish = webcom.GetInfoFinish_PRODRES(
                        outpath_this_seq,
                        origIndex,
                        len(seq),
                        description,
                        source_result="newrun",
                        runtime=runtime)
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file,
                                     "a",
                                     isFlush=True)
                    # now write the text output for this seq

                    # NOTE(review): info_this_seq and resultfile_text_this_seq
                    # are only used by the commented-out call below.
                    info_this_seq = "%s\t%d\t%s\t%s" % (
                        "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    #webcom.WriteSubconsTextResultFile(resultfile_text_this_seq,
                    #        outpath_result, [info_this_seq], runtime_in_sec, g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    if webcom.IsFrontEndNode(g_params['base_www_url']):
                        # Same key as in the cache lookup of phase 1.
                        md5_key = hashlib.md5(
                            (seq +
                             str(query_para)).encode('utf-8')).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_cache, subfoldername)
                        cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                                 md5_key)

                        # copy the zipped folder to the cache path
                        # The working directory is changed so that the archive
                        # contains the relative folder name <md5_key>.
                        # NOTE(review): an exception between the two chdir
                        # calls leaves the process in outpath_result.
                        origpath = os.getcwd()
                        os.chdir(outpath_result)
                        shutil.copytree("seq_%d" % (origIndex), md5_key)
                        cmd = ["zip", "-rq", "%s.zip" % (md5_key), md5_key]
                        # NOTE(review): runjob_logfile is passed as both the
                        # second and the third argument here, whereas other
                        # RunCmd calls pass runjob_errfile third -- confirm
                        # whether this is deliberate.
                        webcom.RunCmd(cmd, runjob_logfile, runjob_logfile)
                        if not os.path.exists(md5_subfolder):
                            os.makedirs(md5_subfolder)
                        shutil.move("%s.zip" % (md5_key),
                                    "%s.zip" % (cachedir))
                        shutil.rmtree(
                            md5_key
                        )  # delete the temp folder named as md5 hash
                        os.chdir(origpath)

                        # Add the finished date to the database
                        date_str = time.strftime(FORMAT_DATETIME)
                        webcom.InsertFinishDateToDB(date_str, md5_key, seq,
                                                    finished_date_db)

    all_end_time = time.time()
    all_runtime_in_sec = all_end_time - all_begin_time

    # Phase 3: package the results.  This runs for a normal job, and for a
    # cache-only job when every sequence was served from the cache.
    if not g_params['isOnlyGetCache'] or len(toRunDict) == 0:
        # now write the text output to a single file
        statfile = "%s/%s" % (outpath_result, "stat.txt")
        #webcom.WriteSubconsTextResultFile(resultfile_text, outpath_result, maplist,
        #        all_runtime_in_sec, g_params['base_www_url'], statfile=statfile)

        # now making zip instead (for windows users)
        # note that zip rq will zip the real data for symbolic links
        # The zip command uses names relative to outpath, hence the chdir.
        os.chdir(outpath)
        #             cmd = ["tar", "-czf", tarball, resultpathname]
        cmd = ["zip", "-rq", zipfile, resultpathname]
        webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)

        # write finish tag file
        if os.path.exists(finished_seq_file):
            webcom.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                        runjob_errfile)

        # The job is successful only when both the finish tag file and the
        # zip file exist; otherwise the failed tag file is written.
        isSuccess = False
        if (os.path.exists(finishtagfile)
                and os.path.exists(zipfile_fullpath)):
            isSuccess = True
        else:
            isSuccess = False
            webcom.WriteDateTimeTagFile(failedtagfile, runjob_logfile,
                                        runjob_errfile)

# send the result to email
# do not sendmail at the cloud VM
        if webcom.IsFrontEndNode(g_params['base_www_url']
                                 ) and myfunc.IsValidEmailAddress(email):
            if isSuccess:
                finish_status = "success"
            else:
                finish_status = "failed"
            # NOTE(review): the from_email literal looks masked -- confirm the
            # real sender address.
            webcom.SendEmail_on_finish(
                jobid,
                g_params['base_www_url'],
                finish_status,
                name_server="PRODRES",
                from_email="*****@*****.**",
                to_email=email,
                contact_email=contact_email,
                logfile=runjob_logfile,
                errfile=runjob_errfile)

    # Any content in the error file (more than one byte) makes the job return
    # 1 and keeps tmpdir; otherwise tmpdir is removed.
    if os.path.exists(runjob_errfile) and os.path.getsize(runjob_errfile) > 1:
        return 1
    else:
        # NOTE(review): msg is assigned in both branches below but never used.
        try:
            shutil.rmtree(tmpdir)
            msg = "rmtree(%s)" % (tmpdir)
            webcom.loginfo("rmtree(%s)" % (tmpdir), runjob_logfile)
        except Exception as e:
            msg = "Failed to rmtree(%s)" % (tmpdir)
            webcom.loginfo(
                "Failed to rmtree(%s) with error message: %s" %
                (tmpdir, str(e)), runjob_errfile)
        return 0
コード例 #2
0
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  #{{{
    """Run the BOCTOPUS2 workflow for every sequence in *infile* and package the results.

    The function works in three phases:

    1. Read the FASTA file record by record.  Unless ``g_params['isForceRun']``
       is set, look for a cached result keyed by the md5 of the sequence;
       records whose cached result contains ``query.predict.png`` are copied
       to ``<outpath>/<jobid>/seq_<index>`` and skipped.
    2. Unless ``g_params['isOnlyGetCache']`` is set, run ``runscript`` once per
       remaining sequence and move its output to ``seq_<index>``.  On the
       front-end node, when the figure file exists, the result folder is moved
       into the cache and replaced by a relative symlink.
    3. Write the combined text result, zip the result folder, write the
       finish/failed tag file and optionally send a notification email.

    Besides its arguments, the function reads several names that are not
    defined here (``gen_errfile``, ``path_cache``, ``FORMAT_DATETIME``,
    ``runscript``, ``contact_email``) -- presumably module-level globals;
    verify against the rest of the file.

    Args:
        infile: Path of the FASTA file with the query sequences.
        outpath: Job folder.  Tag files and log files live directly in it;
            results go to ``<outpath>/<jobid>``.
        tmpdir: Scratch folder.  It is removed at the end unless the error
            file has content.
        email: Address to notify when the job ends; only used when it is
            valid and the code runs on the front-end node.
        jobid: Used as the name of the result folder and of the zip file.
        g_params: Dict of settings.  Keys read here: ``'isForceRun'``,
            ``'isOnlyGetCache'`` and ``'base_www_url'``.

    Returns:
        1 if a result folder could not be created or if ``runjob.err`` exists
        with a size above one byte at the end; 0 otherwise.
    """
    all_begin_time = time.time()

    # NOTE(review): rootname, app_logfile, rmsg, tarball and tarball_fullpath
    # are assigned below but not read anywhere else in this function.
    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    app_logfile = "%s/app.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""

    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)

    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    resultfile_text = "%s/%s" % (outpath_result, "query.top")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)
    # Unlike finished_seq_file, the index file lives directly in outpath.
    finished_idx_file = "%s/finished_seqindex.txt" % (outpath)

    # Create the result folder and its scratch counterpart.
    # NOTE(review): os.makedirs() without exist_ok also raises OSError when the
    # folder already exists, so running again on an existing job folder
    # returns 1 here -- confirm that this is intended.
    for folder in [outpath_result, tmp_outpath_result]:
        try:
            os.makedirs(folder)
        except OSError:
            msg = "Failed to create folder %s" % (folder)
            myfunc.WriteFile(msg + "\n", gen_errfile, "a")
            return 1

    # Create (or truncate) the list of finished sequences; failures are
    # deliberately ignored.
    try:
        open(finished_seq_file, 'w').close()
    except:
        pass
# Phase 1: first try to get results from the cache
# ==================================

    maplist = []  # "seq_<i>\tlength\tdescription\tsequence" per record
    maplist_simple = []  # same as maplist but without the sequence column
    toRunDict = {}  # origIndex -> [seq, numTM, description] for records to run
    hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
    if hdl.failure:
        # NOTE(review): isOK is not read again in this function.
        isOK = False
    else:
        webcom.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                    runjob_errfile)

        recordList = hdl.readseq()
        cnt = 0  # running index of the record within infile
        origpath = os.getcwd()
        # hdl.readseq() is called repeatedly until it returns None.
        while recordList != None:
            for rd in recordList:
                isSkip = False
                # temp outpath for the sequence is always seq_0, and I feed
                # only one seq a time to the workflow
                tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                  "seq_%d" % 0)
                outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % cnt)
                subfoldername_this_seq = "seq_%d" % (cnt)
                if os.path.exists(tmp_outpath_this_seq):
                    try:
                        shutil.rmtree(tmp_outpath_this_seq)
                    except OSError:
                        pass

                maplist.append(
                    "%s\t%d\t%s\t%s" %
                    ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq))
                maplist_simple.append(
                    "%s\t%d\t%s" %
                    ("seq_%d" % cnt, len(rd.seq), rd.description))
                if not g_params['isForceRun']:
                    # The cache key is the md5 of the sequence alone; entries
                    # live under <path_cache>/<first two hex digits>/<md5>,
                    # either as a folder or as a .zip file.
                    md5_key = hashlib.md5(rd.seq.encode('utf-8')).hexdigest()
                    subfoldername = md5_key[:2]
                    cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                             md5_key)
                    zipfile_cache = cachedir + ".zip"
                    if os.path.exists(cachedir) or os.path.exists(
                            zipfile_cache):
                        # An unpacked cache folder takes precedence over the
                        # zipped one.
                        if os.path.exists(cachedir):
                            try:
                                shutil.copytree(cachedir, outpath_this_seq)
                            except Exception as e:
                                msg = "Failed to copytree  %s -> %s" % (
                                    cachedir, outpath_this_seq)
                                date_str = time.strftime(FORMAT_DATETIME)
                                myfunc.WriteFile(
                                    "[%s] %s with errmsg=%s\n" %
                                    (date_str, msg, str(e)), runjob_errfile,
                                    "a")
                        elif os.path.exists(zipfile_cache):
                            # The archive is expected to unpack to a folder
                            # named <md5_key>, which is then renamed to
                            # seq_<cnt>.
                            cmd = [
                                "unzip", zipfile_cache, "-d", outpath_result
                            ]
                            webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)
                            shutil.move("%s/%s" % (outpath_result, md5_key),
                                        outpath_this_seq)

                        # A cached result is accepted only when it contains
                        # query.predict.png.
                        # NOTE(review): fafile_this_seq is not used afterwards.
                        # NOTE(review): the cache is written further down only
                        # when plot/query_0.png exists, which is a different
                        # file from the one checked here -- confirm both are
                        # produced by the workflow.
                        checkfile = "%s/query.predict.png" % (outpath_this_seq)
                        fafile_this_seq = '%s/seq.fa' % (outpath_this_seq)

                        if os.path.exists(outpath_this_seq) and os.path.exists(
                                checkfile):
                            info_finish = webcom.GetInfoFinish_Boctopus2(
                                outpath_this_seq,
                                cnt,
                                len(rd.seq),
                                rd.description,
                                source_result="cached",
                                runtime=0.0)
                            myfunc.WriteFile("\t".join(info_finish) + "\n",
                                             finished_seq_file,
                                             "a",
                                             isFlush=True)
                            myfunc.WriteFile("%d\n" % (cnt),
                                             finished_idx_file,
                                             "a",
                                             isFlush=True)
                            isSkip = True

                if not isSkip:
                    # first try to delete the outfolder if exists
                    # (this also discards a cached copy that failed the
                    # checkfile test above)
                    if os.path.exists(outpath_this_seq):
                        try:
                            shutil.rmtree(outpath_this_seq)
                        except OSError:
                            pass
                    origIndex = cnt
                    numTM = 0
                    toRunDict[origIndex] = [rd.seq, numTM, rd.description
                                            ]  #init value for numTM is 0

                cnt += 1
            recordList = hdl.readseq()
        hdl.close()
    myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)

    # Phase 2: run the workflow for every record without a cached result.
    if not g_params['isOnlyGetCache']:
        # Dump all sequences that still have to be run into one FASTA file,
        # using the original index as the identifier.
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        # Sort by numTM, largest first.  numTM is always 0 in this function,
        # so the stable sort leaves the insertion order unchanged.
        sortedlist = sorted(list(toRunDict.items()),
                            key=lambda x: x[1][1],
                            reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]

        # submit sequences one by one to the workflow according to orders in
        # sortedlist

        for item in sortedlist:
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            subfoldername_this_seq = "seq_%d" % (origIndex)
            outpath_this_seq = "%s/%s" % (outpath_result,
                                          subfoldername_this_seq)
            # The scratch output folder is always seq_0; it is cleared before
            # each run.
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" %
                                              (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" %
                                          (origIndex))
            seqcontent = ">query_%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            # Skip this sequence when its input file could not be written.
            if not os.path.exists(seqfile_this_seq):
                msg = "Failed to generate seq file for index %d" % (origIndex)
                date_str = time.strftime(FORMAT_DATETIME)
                myfunc.WriteFile("[%s] %s\n" % (date_str, msg), runjob_errfile,
                                 "a", True)
                continue

            # runscript receives the sequence file and the scratch result
            # folder; its output is expected under <scratch>/seq_0.
            # NOTE(review): t_success is not checked; whether the run produced
            # output is decided below by looking at the file system.
            cmd = [runscript, seqfile_this_seq, tmp_outpath_result]
            (t_success,
             runtime_in_sec) = webcom.RunCmd(cmd, runjob_logfile,
                                             runjob_errfile, True)

            # Make sure a seq.fa carrying the original description exists in
            # the scratch output folder.
            # NOTE(review): what happens when that folder is missing depends
            # on myfunc.WriteFile -- confirm.
            aaseqfile = "%s/seq.fa" % (tmp_outpath_this_seq)
            if not os.path.exists(aaseqfile):
                seqcontent = ">%s\n%s\n" % (description, seq)
                myfunc.WriteFile(seqcontent, aaseqfile, "w")

            if os.path.exists(tmp_outpath_this_seq):
                # Move the scratch seq_0 folder to the final seq_<origIndex>
                # folder.
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                (isCmdSuccess,
                 t_runtime) = webcom.RunCmd(cmd, runjob_logfile,
                                            runjob_errfile)
                # time.txt is looked for next to (not inside) the scratch
                # seq_0 folder and is moved into the final result folder.
                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists(
                        outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        msg = "Failed to move %s/time.txt" % (
                            tmp_outpath_result)
                        date_str = time.strftime(FORMAT_DATETIME)
                        myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                         runjob_errfile, "a", True)
                        pass

                # Only a successful move marks the sequence as finished.
                if isCmdSuccess:
                    runtime = runtime_in_sec  #in seconds
                    info_finish = webcom.GetInfoFinish_Boctopus2(
                        outpath_this_seq,
                        origIndex,
                        len(seq),
                        description,
                        source_result="newrun",
                        runtime=runtime)
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file,
                                     "a",
                                     isFlush=True)
                    # now write the text output for this seq

                    info_this_seq = "%s\t%d\t%s\t%s" % (
                        "seq_%d" % origIndex, len(seq), description, seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    webcom.WriteBoctopusTextResultFile(
                        resultfile_text_this_seq, outpath_result,
                        [info_this_seq], runtime_in_sec,
                        g_params['base_www_url'])
                    # create or update the md5 cache
                    # create cache only on the front-end
                    figurefile = "%s/plot/query_0.png" % (outpath_this_seq)
                    # Note: do not create the cache if the figure file does
                    # not exist
                    if webcom.IsFrontEndNode(g_params['base_www_url']
                                             ) and os.path.exists(figurefile):
                        md5_key = hashlib.md5(seq.encode('utf-8')).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_cache, subfoldername)
                        cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                                 md5_key)
                        # Step 1: drop any existing cache entry so that the
                        # new result replaces it.
                        if os.path.exists(cachedir):
                            try:
                                shutil.rmtree(cachedir)
                            except:
                                msg = "Failed to shutil.rmtree(%s)" % (
                                    cachedir)
                                date_str = time.strftime(FORMAT_DATETIME)
                                myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                                 runjob_errfile, "a", True)
                                pass

                        # Step 2: make sure the two-character subfolder
                        # exists; failures are ignored and caught by the
                        # existence test in step 3.
                        if not os.path.exists(md5_subfolder):
                            try:
                                os.makedirs(md5_subfolder)
                            except:
                                pass

                        # Step 3: move the result folder itself into the
                        # cache.
                        if os.path.exists(md5_subfolder
                                          ) and not os.path.exists(cachedir):
                            cmd = ["mv", "-f", outpath_this_seq, cachedir]
                            webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)

                        # Step 4: once the move has happened, put a relative
                        # symlink to the cache entry in place of the result
                        # folder.
                        # NOTE(review): the chdir below is not undone, so the
                        # process stays in outpath_result until the
                        # os.chdir(outpath) call in the packaging phase.
                        if not os.path.exists(
                                outpath_this_seq) and os.path.exists(cachedir):
                            rela_path = os.path.relpath(
                                cachedir, outpath_result)  #relative path
                            try:
                                os.chdir(outpath_result)
                                os.symlink(rela_path, subfoldername_this_seq)
                            except:
                                pass

    all_end_time = time.time()
    all_runtime_in_sec = all_end_time - all_begin_time

    # Phase 3: package the results.  This runs for a normal job, and for a
    # cache-only job when every sequence was served from the cache.
    if not g_params['isOnlyGetCache'] or len(toRunDict) == 0:
        # now write the text output to a single file
        statfile = "%s/%s" % (outpath_result, "stat.txt")
        webcom.WriteBoctopusTextResultFile(resultfile_text,
                                           outpath_result,
                                           maplist,
                                           all_runtime_in_sec,
                                           g_params['base_www_url'],
                                           statfile=statfile)

        # now making zip instead (for windows users)
        # note that zip rq will zip the real data for symbolic links
        # The zip command uses names relative to outpath, hence the chdir.
        os.chdir(outpath)
        cmd = ["zip", "-rq", zipfile, resultpathname]
        webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)

        # write finish tag file
        if os.path.exists(finished_seq_file):
            webcom.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                        runjob_errfile)

        # The job is successful only when both the finish tag file and the
        # zip file exist; otherwise the failed tag file is written.
        isSuccess = False
        if (os.path.exists(finishtagfile)
                and os.path.exists(zipfile_fullpath)):
            isSuccess = True
        else:
            isSuccess = False
            failedtagfile = "%s/runjob.failed" % (outpath)
            webcom.WriteDateTimeTagFile(failedtagfile, runjob_logfile,
                                        runjob_errfile)

# send the result to email
# do not sendmail at the cloud VM
        if webcom.IsFrontEndNode(g_params['base_www_url']
                                 ) and myfunc.IsValidEmailAddress(email):
            if isSuccess:
                finish_status = "success"
            else:
                finish_status = "failed"
            # NOTE(review): the from_email literal does not look like a valid
            # address (possibly mangled) -- confirm the real sender address.
            webcom.SendEmail_on_finish(
                jobid,
                g_params['base_www_url'],
                finish_status,
                name_server="BOCTOPUS2",
                from_email="[email protected])",
                to_email=email,
                contact_email=contact_email,
                logfile=runjob_logfile,
                errfile=runjob_errfile)

    # Any content in the error file (more than one byte) makes the job return
    # 1 and keeps tmpdir; otherwise tmpdir is removed and the outcome logged.
    if os.path.exists(runjob_errfile) and os.path.getsize(runjob_errfile) > 1:
        return 1
    else:
        date_str = time.strftime(FORMAT_DATETIME)
        try:
            shutil.rmtree(tmpdir)
            msg = "rmtree(%s)" % (tmpdir)
            myfunc.WriteFile("[%s] %s\n" % (date_str, msg), runjob_logfile,
                             "a", True)
        except Exception as e:
            # The failure is written to the error file, but 0 is still
            # returned.
            msg = "Failed to rmtree(%s)" % (tmpdir)
            myfunc.WriteFile("[%s] %s\n" % (date_str, msg), runjob_errfile,
                             "a", True)
            pass
        return 0
コード例 #3
0
ファイル: run_job.py プロジェクト: nanjiangshu/web_scampi2
def RunJob_msa(infile, outpath, tmpdir, email, jobid, g_params):  #{{{
    """Run the SCAMPI2-msa workflow for every sequence in a FASTA file.

    Sequences whose md5 digest is found in the sqlite cache are reported
    as "cached"; the remaining ones are submitted one by one to the
    external run script and reported as "newrun". Afterwards the text
    result file and a zip archive of the result folder are produced, a
    finish/failed tag file is written and, on the front-end node, a
    notification e-mail is sent.

    Args:
        infile: Path of the FASTA file read with myfunc.ReadFastaByBlock.
        outpath: Job folder holding the tag files, the log/err files and
            the result folder.
        tmpdir: Scratch folder; removed when the job succeeded.
        email: Address to notify; ignored unless it validates with
            myfunc.IsValidEmailAddress.
        jobid: Job identifier, also used as the name of the result folder
            and of the zip archive.
        g_params: Dict of run-time settings. The keys read here are
            'isForceRun', 'isOnlyGetCache', 'runjob_err', 'runjob_log'
            and 'base_www_url'.

    Returns:
        1 if a result folder could not be created, otherwise 0 (including
        the case where the job is skipped because the daemon already
        started to process it).

    Note:
        Besides myfunc and webcom, this function relies on the names
        gen_errfile, db_cache_SCAMPI2MSA, dbmsa_tablename, runscript_msa,
        blastdir, blastdb and contact_email, which are not defined here
        and are expected to be available at module level.
    """
    all_begin_time = time.time()

    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    starttagfile = "%s/runjob.start" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    failtagfile = "%s/runjob.failed" % (outpath)
    qdinit_start_tagfile = "%s/runjob.qdinit.start" % (outpath)

    # if the daemon starts to process the job before the run_job.py running
    # in the local queue, skip it
    if os.path.exists(qdinit_start_tagfile):
        return 0

    resultpathname = jobid

    outpath_result = "%s/%s" % (outpath, resultpathname)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)

    zipfile = "%s.zip" % (resultpathname)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)
    finished_idx_file = "%s/finished_seqindex.txt" % (outpath)

    for folder in [outpath_result, tmp_outpath_result]:
        try:
            os.makedirs(folder)
        except OSError:
            msg = "Failed to create folder %s" % (folder)
            myfunc.WriteFile(msg + "\n", gen_errfile, "a")
            return 1

    # Start from an empty finished_seqs file. This is best effort: later
    # steps check for the file's existence before writing the finish tag.
    try:
        open(finished_seq_file, 'w').close()
    except (IOError, OSError):
        pass

    # first getting result from caches
    # ==================================
    maplist = []
    toRunDict = {}
    hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
    if not hdl.failure:
        webcom.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                    runjob_errfile)
        cnt = 0
        con = sqlite3.connect(db_cache_SCAMPI2MSA)
        try:
            # "with con" only commits or rolls back the transaction; the
            # connection itself is closed in the finally clause below.
            with con:
                cur = con.cursor()
                # A table name cannot be bound as a parameter, so it is
                # formatted into the statement.
                cur.execute("""
                    CREATE TABLE IF NOT EXISTS %s
                    (
                        md5 VARCHAR(100),
                        seq VARCHAR(30000),
                        top VARCHAR(30000),
                        PRIMARY KEY (md5)
                    )""" % (dbmsa_tablename))
                # The md5 value is bound with a placeholder instead of
                # being pasted in as a double-quoted token, which SQL
                # treats as an identifier.
                select_cmd = "SELECT md5, seq, top FROM %s WHERE md5 = ?" % (
                    dbmsa_tablename)
                recordList = hdl.readseq()
                while recordList is not None:
                    for rd in recordList:
                        isSkip = False
                        if not g_params['isForceRun']:
                            md5_key = hashlib.md5(
                                rd.seq.encode('utf-8')).hexdigest()
                            cur.execute(select_cmd, (md5_key, ))
                            rows = cur.fetchall()
                            for row in rows:
                                top = row[2]
                                numTM = myfunc.CountTM(top)
                                # info_finish has 8 items
                                info_finish = [
                                    "seq_%d" % cnt,
                                    str(len(rd.seq)),
                                    str(numTM), "cached",
                                    str(0.0), rd.description, rd.seq, top
                                ]
                                myfunc.WriteFile(
                                    "\t".join(info_finish) + "\n",
                                    finished_seq_file,
                                    "a",
                                    isFlush=True)
                                myfunc.WriteFile("%d\n" % (cnt),
                                                 finished_idx_file,
                                                 "a",
                                                 isFlush=True)
                                isSkip = True

                        if not isSkip:
                            # not cached: queue the sequence under its
                            # original index; init value for numTM is 0
                            toRunDict[cnt] = [rd.seq, 0, rd.description]
                        cnt += 1
                    recordList = hdl.readseq()
        finally:
            con.close()
            hdl.close()

    if not g_params['isOnlyGetCache']:
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = [
            ">%s\n%s" % (str(key), toRunDict[key][0]) for key in toRunDict
        ]
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist
        sortedlist = sorted(toRunDict.items(),
                            key=lambda x: x[1][1],
                            reverse=True)
        #format of sortedlist [(origIndex: [seq, numTM, description]), ...]
        # submit sequences one by one to the workflow according to orders in
        # sortedlist
        for item in sortedlist:
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % origIndex)
            # The scratch folder is always named seq_0 and is moved to
            # seq_<origIndex> after the run.
            # NOTE(review): presumably intentional - confirm.
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" %
                                              (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass
            try:
                os.makedirs(tmp_outpath_this_seq)
            except OSError:
                g_params['runjob_err'].append(
                    "Failed to create the tmp_outpath_this_seq %s" %
                    (tmp_outpath_this_seq))
                continue

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" %
                                          (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append(
                    "failed to generate seq index %d" % (origIndex))
                continue

            if not os.path.exists("%s/seq.fa" % (tmp_outpath_this_seq)):
                try:
                    shutil.copyfile(seqfile_this_seq,
                                    "%s/seq.fa" % (tmp_outpath_this_seq))
                except OSError:
                    pass

            outtopfile = "%s/query.top" % (tmp_outpath_this_seq)
            cmd = [
                runscript_msa, seqfile_this_seq, outtopfile, blastdir, blastdb
            ]
            # Only the run time is used; the success flag of the run is
            # not checked here (a missing result shows up further below).
            (_, runtime_in_sec) = webcom.RunCmd(cmd,
                                                runjob_logfile,
                                                runjob_errfile,
                                                verbose=True)

            if os.path.exists(tmp_outpath_this_seq):
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                (isCmdSuccess, _) = webcom.RunCmd(cmd,
                                                  runjob_logfile,
                                                  runjob_errfile,
                                                  verbose=True)

                if isCmdSuccess:
                    runtime = runtime_in_sec  #in seconds
                    predfile = "%s/query.top" % (outpath_this_seq)
                    (seqid, seqanno, top) = myfunc.ReadSingleFasta(predfile)
                    numTM = myfunc.CountTM(top)
                    # info_finish has 8 items
                    info_finish = [
                        "seq_%d" % origIndex,
                        str(len(seq)),
                        str(numTM), "newrun",
                        str(runtime), description, seq, top
                    ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file,
                                     "a",
                                     isFlush=True)

    all_end_time = time.time()
    all_runtime_in_sec = all_end_time - all_begin_time

    if len(g_params['runjob_log']) > 0:
        rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_log']) + "\n",
                                  runjob_logfile, "a")
        if rt_msg:
            g_params['runjob_err'].append(rt_msg)

    if not g_params['isOnlyGetCache'] or len(toRunDict) == 0:
        if os.path.exists(finished_seq_file):
            webcom.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                        runjob_errfile)

        # now write the text output to a single file
        dumped_resultfile = "%s/%s" % (outpath_result, "query.top")
        statfile = "%s/%s" % (outpath_result, "stat.txt")
        webcom.WriteSCAMPI2MSATextResultFile(dumped_resultfile,
                                             outpath_result,
                                             maplist,
                                             all_runtime_in_sec,
                                             g_params['base_www_url'],
                                             statfile=statfile)

        # now making zip instead (for windows users); zip is run from
        # inside outpath so the archive holds relative paths, and the
        # working directory is restored even if the command raises
        pwd = os.getcwd()
        os.chdir(outpath)
        try:
            cmd = ["zip", "-rq", zipfile, resultpathname]
            webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)
        finally:
            os.chdir(pwd)

        isSuccess = (os.path.exists(finishtagfile)
                     and os.path.exists(zipfile_fullpath))
        if isSuccess:
            # delete the tmpdir if succeeded; a cleanup failure is logged
            # but must not prevent the notification below
            try:
                shutil.rmtree(tmpdir)
            except OSError as e:
                myfunc.WriteFile(
                    "Failed to rmtree(%s): %s\n" % (tmpdir, e),
                    runjob_errfile, "a")
        else:
            webcom.WriteDateTimeTagFile(failtagfile, runjob_logfile,
                                        runjob_errfile)

        # one of "success" or "failed"
        finish_status = "success" if isSuccess else "failed"

        # send the result to email
        # do not sendmail at the cloud VM
        if webcom.IsFrontEndNode(g_params['base_www_url']
                                 ) and myfunc.IsValidEmailAddress(email):
            # NOTE(review): the sender address looks redacted in this
            # copy of the file - confirm the real value.
            webcom.SendEmail_on_finish(jobid,
                                       g_params['base_www_url'],
                                       finish_status,
                                       name_server="SCAMPI2-msa",
                                       from_email="*****@*****.**",
                                       to_email=email,
                                       contact_email=contact_email,
                                       logfile=runjob_logfile,
                                       errfile=runjob_errfile)
    return 0