def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  # {{{
    """Run the PRODRES pipeline for every sequence in infile and write the
    per-sequence results to outpath."""
    all_begin_time = time.time()

    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    app_logfile = "%s/app.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    failedtagfile = "%s/runjob.failed" % (outpath)
    query_parafile = "%s/query.para.txt" % (outpath)

    query_para = ""
    content = myfunc.ReadFile(query_parafile)
    if content != "":
        query_para = json.loads(content)

    rmsg = ""

    resultpathname = jobid
    outpath_result = "%s/%s" % (outpath, resultpathname)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)

    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)

    for folder in [outpath_result, tmp_outpath_result]:
        try:
            os.makedirs(folder)
        except OSError:
            msg = "Failed to create folder %s" % (folder)
            myfunc.WriteFile(msg + "\n", gen_errfile, "a")
            return 1

    try:
        open(finished_seq_file, 'w').close()
    except:
        pass

    # first get results from the cache
    # ==================================
    maplist = []
    maplist_simple = []
    toRunDict = {}
    hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
    if hdl.failure:
        isOK = False
    else:
        webcom.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                    runjob_errfile)
        recordList = hdl.readseq()
        cnt = 0
        origpath = os.getcwd()
        while recordList is not None:
            for rd in recordList:
                isSkip = False
                # the temp outpath for the sequence is always seq_0, since
                # only one sequence at a time is fed to the workflow
                tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                  "seq_%d" % 0)
                outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % cnt)
                subfoldername_this_seq = "seq_%d" % (cnt)
                if os.path.exists(tmp_outpath_this_seq):
                    try:
                        shutil.rmtree(tmp_outpath_this_seq)
                    except OSError:
                        pass

                maplist.append("%s\t%d\t%s\t%s" % ("seq_%d" % cnt,
                                                   len(rd.seq),
                                                   rd.description, rd.seq))
                maplist_simple.append("%s\t%d\t%s" % ("seq_%d" % cnt,
                                                      len(rd.seq),
                                                      rd.description))

                if not g_params['isForceRun']:
                    # the cache key covers both the sequence and the query
                    # parameters, so changed settings never hit a stale result
                    md5_key = hashlib.md5(
                        (rd.seq + str(query_para)).encode('utf-8')).hexdigest()
                    subfoldername = md5_key[:2]
                    cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                             md5_key)
                    zipfile_cache = cachedir + ".zip"
                    if os.path.exists(cachedir) or os.path.exists(zipfile_cache):
                        if os.path.exists(cachedir):
                            try:
                                shutil.copytree(cachedir, outpath_this_seq)
                            except Exception as e:
                                msg = "Failed to copytree %s -> %s" % (
                                    cachedir, outpath_this_seq)
                                date_str = time.strftime(FORMAT_DATETIME)
                                myfunc.WriteFile(
                                    "[%s] %s with errmsg=%s\n" % (
                                        date_str, msg, str(e)),
                                    runjob_errfile, "a")
                        elif os.path.exists(zipfile_cache):
                            cmd = ["unzip", zipfile_cache, "-d", outpath_result]
                            webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)
                            shutil.move("%s/%s" % (outpath_result, md5_key),
                                        outpath_this_seq)

                        if os.path.exists(outpath_this_seq):
                            info_finish = webcom.GetInfoFinish_PRODRES(
                                outpath_this_seq,
                                cnt, len(rd.seq), rd.description,
                                source_result="cached", runtime=0.0)
                            myfunc.WriteFile("\t".join(info_finish) + "\n",
                                             finished_seq_file, "a",
                                             isFlush=True)
                            isSkip = True

                if not isSkip:
                    # first try to delete the outfolder if it exists
                    if os.path.exists(outpath_this_seq):
                        try:
                            shutil.rmtree(outpath_this_seq)
                        except OSError:
                            pass
                    origIndex = cnt
                    numTM = 0
                    # init value for numTM is 0
                    toRunDict[origIndex] = [rd.seq, numTM, rd.description]
                cnt += 1
            recordList = hdl.readseq()
        hdl.close()
    myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)

    if not g_params['isOnlyGetCache']:
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        sortedlist = sorted(list(toRunDict.items()), key=lambda x: x[1][1],
                            reverse=True)
        # format of sortedlist: [(origIndex, [seq, numTM, description]), ...]

        # submit the sequences one by one to the workflow in the order of
        # sortedlist
        for item in sortedlist:
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            subfoldername_this_seq = "seq_%d" % (origIndex)
            outpath_this_seq = "%s/%s" % (outpath_result,
                                          subfoldername_this_seq)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                              "seq_%d" % (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result,
                                          "query_%d.fa" % (origIndex))
            seqcontent = ">query_%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            if not os.path.exists(seqfile_this_seq):
                msg = "Failed to generate the seq file for index %d" % (origIndex)
                date_str = time.strftime(g_params['FORMAT_DATETIME'])
                myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                 runjob_errfile, "a", True)
                continue

            # build the workflow command; optional arguments are appended
            # only when set in the query parameters
            cmd = ["python", runscript,
                   "--input", seqfile_this_seq,
                   "--output", tmp_outpath_this_seq,
                   "--pfam-dir", path_pfamdatabase,
                   "--pfamscan-script", path_pfamscanscript,
                   "--fallback-db-fasta", blastdb]

            if 'second_method' in query_para and query_para['second_method'] != "":
                cmd += ['--second-search', query_para['second_method']]

            if 'pfamscan_evalue' in query_para and query_para['pfamscan_evalue'] != "":
                cmd += ['--pfamscan_e-val', query_para['pfamscan_evalue']]
            elif 'pfamscan_bitscore' in query_para and query_para['pfamscan_bitscore'] != "":
                cmd += ['--pfamscan_bitscore', query_para['pfamscan_bitscore']]

            if 'pfamscan_clanoverlap' in query_para:
                if query_para['pfamscan_clanoverlap'] == False:
                    cmd += ['--pfamscan_clan-overlap', 'no']
                else:
                    cmd += ['--pfamscan_clan-overlap', 'yes']

            if 'jackhmmer_iteration' in query_para and query_para['jackhmmer_iteration'] != "":
                cmd += ['--jackhmmer_max_iter', query_para['jackhmmer_iteration']]

            if 'jackhmmer_threshold_type' in query_para and query_para['jackhmmer_threshold_type'] != "":
                cmd += ['--jackhmmer-threshold-type', query_para['jackhmmer_threshold_type']]

            if 'jackhmmer_evalue' in query_para and query_para['jackhmmer_evalue'] != "":
                cmd += ['--jackhmmer_e-val', query_para['jackhmmer_evalue']]
            elif 'jackhmmer_bitscore' in query_para and query_para['jackhmmer_bitscore'] != "":
                cmd += ['--jackhmmer_bit-score', query_para['jackhmmer_bitscore']]

            if 'psiblast_iteration' in query_para and query_para['psiblast_iteration'] != "":
                cmd += ['--psiblast_iter', query_para['psiblast_iteration']]

            if 'psiblast_outfmt' in query_para and query_para['psiblast_outfmt'] != "":
                cmd += ['--psiblast_outfmt', query_para['psiblast_outfmt']]

            (t_success, runtime_in_sec) = webcom.RunCmd(cmd, runjob_logfile,
                                                        runjob_errfile, True)

            aaseqfile = "%s/seq.fa" % (tmp_outpath_this_seq + os.sep + "query_0")
            if not os.path.exists(aaseqfile):
                seqcontent = ">%s\n%s\n" % (description, seq)
                myfunc.WriteFile(seqcontent, aaseqfile, "w")

            if os.path.exists(tmp_outpath_this_seq):
                cmd = ["mv", "-f",
                       tmp_outpath_this_seq + os.sep + "query_0",
                       outpath_this_seq]
                isCmdSuccess = False
                (isCmdSuccess, t_runtime) = webcom.RunCmd(cmd, runjob_logfile,
                                                          runjob_errfile, True)

                if 'isKeepTempFile' not in query_para or query_para['isKeepTempFile'] == False:
                    try:
                        temp_result_folder = "%s/temp" % (outpath_this_seq)
                        shutil.rmtree(temp_result_folder)
                    except:
                        msg = "Failed to delete the folder %s" % (temp_result_folder)
                        date_str = time.strftime(g_params['FORMAT_DATETIME'])
                        myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                         runjob_errfile, "a", True)

                    flist = [
                        "%s/outputs/%s" % (outpath_this_seq, "Alignment.txt"),
                        "%s/outputs/%s" % (outpath_this_seq, "tableOut.txt"),
                        "%s/outputs/%s" % (outpath_this_seq, "fullOut.txt")
                    ]
                    for f in flist:
                        if os.path.exists(f):
                            try:
                                os.remove(f)
                            except:
                                msg = "Failed to delete the file %s" % (f)
                                date_str = time.strftime(g_params['FORMAT_DATETIME'])
                                myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                                 runjob_errfile, "a", True)

                if isCmdSuccess:
                    timefile = "%s/time.txt" % (outpath_this_seq)
                    runtime = webcom.ReadRuntimeFromFile(timefile,
                                                         default_runtime=0.0)
                    info_finish = webcom.GetInfoFinish_PRODRES(
                        outpath_this_seq,
                        origIndex, len(seq), description,
                        source_result="newrun", runtime=runtime)
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file, "a", isFlush=True)

                    # now write the text output for this seq
                    info_this_seq = "%s\t%d\t%s\t%s" % ("seq_%d" % origIndex,
                                                        len(seq), description,
                                                        seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    # webcom.WriteSubconsTextResultFile(resultfile_text_this_seq,
                    #         outpath_result, [info_this_seq], runtime_in_sec,
                    #         g_params['base_www_url'])

                    # create or update the md5 cache,
                    # but only on the front-end node
                    if webcom.IsFrontEndNode(g_params['base_www_url']):
                        md5_key = hashlib.md5(
                            (seq + str(query_para)).encode('utf-8')).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_cache, subfoldername)
                        cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                                 md5_key)

                        # copy the zipped folder to the cache path
                        origpath = os.getcwd()
                        os.chdir(outpath_result)
                        shutil.copytree("seq_%d" % (origIndex), md5_key)
                        cmd = ["zip", "-rq", "%s.zip" % (md5_key), md5_key]
                        webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)
                        if not os.path.exists(md5_subfolder):
                            os.makedirs(md5_subfolder)
                        shutil.move("%s.zip" % (md5_key), "%s.zip" % (cachedir))
                        # delete the temp folder named by the md5 hash
                        shutil.rmtree(md5_key)
                        os.chdir(origpath)

                        # add the finished date to the database
                        date_str = time.strftime(FORMAT_DATETIME)
                        webcom.InsertFinishDateToDB(date_str, md5_key, seq,
                                                    finished_date_db)

    all_end_time = time.time()
    all_runtime_in_sec = all_end_time - all_begin_time

    if not g_params['isOnlyGetCache'] or len(toRunDict) == 0:
        # now write the text output to a single file
        statfile = "%s/%s" % (outpath_result, "stat.txt")
        # webcom.WriteSubconsTextResultFile(resultfile_text, outpath_result,
        #         maplist, all_runtime_in_sec, g_params['base_www_url'],
        #         statfile=statfile)

        # make a zip archive instead of a tarball (for Windows users);
        # note that zip -rq will store the real data for symbolic links
        os.chdir(outpath)
        # cmd = ["tar", "-czf", tarball, resultpathname]
        cmd = ["zip", "-rq", zipfile, resultpathname]
        webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)

        # write the finish tag file
        if os.path.exists(finished_seq_file):
            webcom.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                        runjob_errfile)

        isSuccess = False
        if (os.path.exists(finishtagfile) and os.path.exists(zipfile_fullpath)):
            isSuccess = True
        else:
            isSuccess = False
            webcom.WriteDateTimeTagFile(failedtagfile, runjob_logfile,
                                        runjob_errfile)

        # send the result by email
        # do not send mail from the cloud VM
        if webcom.IsFrontEndNode(g_params['base_www_url']) and myfunc.IsValidEmailAddress(email):
            if isSuccess:
                finish_status = "success"
            else:
                finish_status = "failed"
            webcom.SendEmail_on_finish(
                jobid, g_params['base_www_url'],
                finish_status,
                name_server="PRODRES",
                from_email="*****@*****.**",
                to_email=email,
                contact_email=contact_email,
                logfile=runjob_logfile, errfile=runjob_errfile)

    if os.path.exists(runjob_errfile) and os.path.getsize(runjob_errfile) > 1:
        return 1
    else:
        try:
            shutil.rmtree(tmpdir)
            webcom.loginfo("rmtree(%s)" % (tmpdir), runjob_logfile)
        except Exception as e:
            webcom.loginfo("Failed to rmtree(%s) with error message: %s" % (
                tmpdir, str(e)), runjob_errfile)
    return 0
# }}}
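
# Both RunJob variants in this file locate disk-cache entries from an md5
# digest of the query: the PRODRES variant above hashes the sequence plus the
# serialized query parameters, while the BOCTOPUS2 variant below hashes the
# sequence alone; the first two hex characters of the digest shard the cache
# directory. The helper below is a minimal, self-contained sketch of that
# scheme for illustration only; its name and the default path_cache value are
# hypothetical, not part of this module.
import hashlib
import os


def sketch_cache_location(seq, query_para=None, path_cache="/tmp/path_cache"):
    """Return (md5_key, cachedir) following the sharding scheme used above."""
    # PRODRES-style key if query parameters are given, otherwise a
    # BOCTOPUS2-style key from the sequence alone
    payload = seq if query_para is None else seq + str(query_para)
    md5_key = hashlib.md5(payload.encode('utf-8')).hexdigest()
    subfoldername = md5_key[:2]  # two-character shard folder
    cachedir = os.path.join(path_cache, subfoldername, md5_key)
    return (md5_key, cachedir)


# Example: the same sequence maps to different cache entries once the query
# parameters differ, so changed settings never hit a stale cached result:
# sketch_cache_location("MKTAYIAK", {"second_method": "jackhmmer"})
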
def RunJob(infile, outpath, tmpdir, email, jobid, g_params):  # {{{
    """Run the BOCTOPUS2 pipeline for every sequence in infile and write the
    per-sequence results to outpath."""
    all_begin_time = time.time()

    rootname = os.path.basename(os.path.splitext(infile)[0])
    starttagfile = "%s/runjob.start" % (outpath)
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    app_logfile = "%s/app.log" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    rmsg = ""

    resultpathname = jobid
    outpath_result = "%s/%s" % (outpath, resultpathname)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)

    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    resultfile_text = "%s/%s" % (outpath_result, "query.top")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)
    finished_idx_file = "%s/finished_seqindex.txt" % (outpath)

    for folder in [outpath_result, tmp_outpath_result]:
        try:
            os.makedirs(folder)
        except OSError:
            msg = "Failed to create folder %s" % (folder)
            myfunc.WriteFile(msg + "\n", gen_errfile, "a")
            return 1

    try:
        open(finished_seq_file, 'w').close()
    except:
        pass

    # first get results from the cache
    # ==================================
    maplist = []
    maplist_simple = []
    toRunDict = {}
    hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
    if hdl.failure:
        isOK = False
    else:
        webcom.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                    runjob_errfile)
        recordList = hdl.readseq()
        cnt = 0
        origpath = os.getcwd()
        while recordList is not None:
            for rd in recordList:
                isSkip = False
                # the temp outpath for the sequence is always seq_0, since
                # only one sequence at a time is fed to the workflow
                tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                                  "seq_%d" % 0)
                outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % cnt)
                subfoldername_this_seq = "seq_%d" % (cnt)
                if os.path.exists(tmp_outpath_this_seq):
                    try:
                        shutil.rmtree(tmp_outpath_this_seq)
                    except OSError:
                        pass

                maplist.append("%s\t%d\t%s\t%s" % ("seq_%d" % cnt,
                                                   len(rd.seq),
                                                   rd.description, rd.seq))
                maplist_simple.append("%s\t%d\t%s" % ("seq_%d" % cnt,
                                                      len(rd.seq),
                                                      rd.description))

                if not g_params['isForceRun']:
                    # here the cache key is the md5 of the sequence alone
                    md5_key = hashlib.md5(rd.seq.encode('utf-8')).hexdigest()
                    subfoldername = md5_key[:2]
                    cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                             md5_key)
                    zipfile_cache = cachedir + ".zip"
                    if os.path.exists(cachedir) or os.path.exists(zipfile_cache):
                        if os.path.exists(cachedir):
                            try:
                                shutil.copytree(cachedir, outpath_this_seq)
                            except Exception as e:
                                msg = "Failed to copytree %s -> %s" % (
                                    cachedir, outpath_this_seq)
                                date_str = time.strftime(FORMAT_DATETIME)
                                myfunc.WriteFile(
                                    "[%s] %s with errmsg=%s\n" % (
                                        date_str, msg, str(e)),
                                    runjob_errfile, "a")
                        elif os.path.exists(zipfile_cache):
                            cmd = ["unzip", zipfile_cache, "-d", outpath_result]
                            webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)
                            shutil.move("%s/%s" % (outpath_result, md5_key),
                                        outpath_this_seq)

                        checkfile = "%s/query.predict.png" % (outpath_this_seq)
                        fafile_this_seq = '%s/seq.fa' % (outpath_this_seq)
                        if os.path.exists(outpath_this_seq) and os.path.exists(checkfile):
                            info_finish = webcom.GetInfoFinish_Boctopus2(
                                outpath_this_seq,
                                cnt, len(rd.seq), rd.description,
                                source_result="cached", runtime=0.0)
                            myfunc.WriteFile("\t".join(info_finish) + "\n",
                                             finished_seq_file, "a",
                                             isFlush=True)
                            myfunc.WriteFile("%d\n" % (cnt),
                                             finished_idx_file, "a",
                                             isFlush=True)
                            isSkip = True

                if not isSkip:
                    # first try to delete the outfolder if it exists
                    if os.path.exists(outpath_this_seq):
                        try:
                            shutil.rmtree(outpath_this_seq)
                        except OSError:
                            pass
                    origIndex = cnt
                    numTM = 0
                    # init value for numTM is 0
                    toRunDict[origIndex] = [rd.seq, numTM, rd.description]
                cnt += 1
            recordList = hdl.readseq()
        hdl.close()
    myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile)

    if not g_params['isOnlyGetCache']:
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        sortedlist = sorted(list(toRunDict.items()), key=lambda x: x[1][1],
                            reverse=True)
        # format of sortedlist: [(origIndex, [seq, numTM, description]), ...]

        # submit the sequences one by one to the workflow in the order of
        # sortedlist
        for item in sortedlist:
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            subfoldername_this_seq = "seq_%d" % (origIndex)
            outpath_this_seq = "%s/%s" % (outpath_result,
                                          subfoldername_this_seq)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                              "seq_%d" % (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result,
                                          "query_%d.fa" % (origIndex))
            seqcontent = ">query_%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            if not os.path.exists(seqfile_this_seq):
                msg = "Failed to generate the seq file for index %d" % (origIndex)
                date_str = time.strftime(FORMAT_DATETIME)
                myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                 runjob_errfile, "a", True)
                continue

            cmd = [runscript, seqfile_this_seq, tmp_outpath_result]
            (t_success, runtime_in_sec) = webcom.RunCmd(cmd, runjob_logfile,
                                                        runjob_errfile, True)

            aaseqfile = "%s/seq.fa" % (tmp_outpath_this_seq)
            if not os.path.exists(aaseqfile):
                seqcontent = ">%s\n%s\n" % (description, seq)
                myfunc.WriteFile(seqcontent, aaseqfile, "w")

            if os.path.exists(tmp_outpath_this_seq):
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                (isCmdSuccess, t_runtime) = webcom.RunCmd(cmd, runjob_logfile,
                                                          runjob_errfile)

                timefile = "%s/time.txt" % (tmp_outpath_result)
                targetfile = "%s/time.txt" % (outpath_this_seq)
                if os.path.exists(timefile) and os.path.exists(outpath_this_seq):
                    try:
                        shutil.move(timefile, targetfile)
                    except:
                        msg = "Failed to move %s/time.txt" % (tmp_outpath_result)
                        date_str = time.strftime(FORMAT_DATETIME)
                        myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                         runjob_errfile, "a", True)

                if isCmdSuccess:
                    runtime = runtime_in_sec  # in seconds
                    info_finish = webcom.GetInfoFinish_Boctopus2(
                        outpath_this_seq,
                        origIndex, len(seq), description,
                        source_result="newrun", runtime=runtime)
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file, "a", isFlush=True)

                    # now write the text output for this seq
                    info_this_seq = "%s\t%d\t%s\t%s" % ("seq_%d" % origIndex,
                                                        len(seq), description,
                                                        seq)
                    resultfile_text_this_seq = "%s/%s" % (outpath_this_seq,
                                                          "query.result.txt")
                    webcom.WriteBoctopusTextResultFile(
                        resultfile_text_this_seq, outpath_result,
                        [info_this_seq], runtime_in_sec,
                        g_params['base_www_url'])

                    # create or update the md5 cache,
                    # but only on the front-end node
                    figurefile = "%s/plot/query_0.png" % (outpath_this_seq)
                    # Note: do not create the cache if the figure file does
                    # not exist
                    if webcom.IsFrontEndNode(g_params['base_www_url']) and os.path.exists(figurefile):
                        md5_key = hashlib.md5(seq.encode('utf-8')).hexdigest()
                        subfoldername = md5_key[:2]
                        md5_subfolder = "%s/%s" % (path_cache, subfoldername)
                        cachedir = "%s/%s/%s" % (path_cache, subfoldername,
                                                 md5_key)
                        if os.path.exists(cachedir):
                            try:
                                shutil.rmtree(cachedir)
                            except:
                                msg = "Failed to shutil.rmtree(%s)" % (cachedir)
                                date_str = time.strftime(FORMAT_DATETIME)
                                myfunc.WriteFile("[%s] %s\n" % (date_str, msg),
                                                 runjob_errfile, "a", True)

                        if not os.path.exists(md5_subfolder):
                            try:
                                os.makedirs(md5_subfolder)
                            except:
                                pass

                        if os.path.exists(md5_subfolder) and not os.path.exists(cachedir):
                            cmd = ["mv", "-f", outpath_this_seq, cachedir]
                            webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)

                        # replace the moved result folder by a relative
                        # symlink into the cache
                        if not os.path.exists(outpath_this_seq) and os.path.exists(cachedir):
                            rela_path = os.path.relpath(cachedir,
                                                        outpath_result)
                            try:
                                os.chdir(outpath_result)
                                os.symlink(rela_path, subfoldername_this_seq)
                            except:
                                pass

    all_end_time = time.time()
    all_runtime_in_sec = all_end_time - all_begin_time

    if not g_params['isOnlyGetCache'] or len(toRunDict) == 0:
        # now write the text output to a single file
        statfile = "%s/%s" % (outpath_result, "stat.txt")
        webcom.WriteBoctopusTextResultFile(resultfile_text, outpath_result,
                                           maplist, all_runtime_in_sec,
                                           g_params['base_www_url'],
                                           statfile=statfile)

        # make a zip archive instead of a tarball (for Windows users);
        # note that zip -rq will store the real data for symbolic links
        os.chdir(outpath)
        cmd = ["zip", "-rq", zipfile, resultpathname]
        webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)

        # write the finish tag file
        if os.path.exists(finished_seq_file):
            webcom.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                        runjob_errfile)

        isSuccess = False
        if (os.path.exists(finishtagfile) and os.path.exists(zipfile_fullpath)):
            isSuccess = True
        else:
            isSuccess = False
            failedtagfile = "%s/runjob.failed" % (outpath)
            webcom.WriteDateTimeTagFile(failedtagfile, runjob_logfile,
                                        runjob_errfile)

        # send the result by email
        # do not send mail from the cloud VM
        if webcom.IsFrontEndNode(g_params['base_www_url']) and myfunc.IsValidEmailAddress(email):
            if isSuccess:
                finish_status = "success"
            else:
                finish_status = "failed"
            webcom.SendEmail_on_finish(
                jobid, g_params['base_www_url'],
                finish_status,
                name_server="BOCTOPUS2",
                from_email="[email protected]",
                to_email=email,
                contact_email=contact_email,
                logfile=runjob_logfile, errfile=runjob_errfile)

    if os.path.exists(runjob_errfile) and os.path.getsize(runjob_errfile) > 1:
        return 1
    else:
        date_str = time.strftime(FORMAT_DATETIME)
        try:
            shutil.rmtree(tmpdir)
            msg = "rmtree(%s)" % (tmpdir)
            myfunc.WriteFile("[%s] %s\n" % (date_str, msg), runjob_logfile,
                             "a", True)
        except Exception as e:
            msg = "Failed to rmtree(%s) with errmsg=%s" % (tmpdir, str(e))
            myfunc.WriteFile("[%s] %s\n" % (date_str, msg), runjob_errfile,
                             "a", True)
    return 0
# }}}
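
# The BOCTOPUS2 variant above moves a finished result folder into the cache
# and then replaces it with a relative symlink, so that the later "zip -rq"
# call archives the real data (zip follows symbolic links). The sketch below
# condenses that move-and-link step; the function name is illustrative, and
# it builds the link by joining paths instead of os.chdir'ing into
# outpath_result as the code above does.
import os
import shutil


def sketch_move_to_cache_and_link(outpath_this_seq, cachedir, outpath_result):
    """Move a per-sequence result folder into the cache, then link it back."""
    if not os.path.exists(cachedir):
        os.makedirs(os.path.dirname(cachedir), exist_ok=True)
        shutil.move(outpath_this_seq, cachedir)  # stands in for: mv -f ...
    if not os.path.exists(outpath_this_seq) and os.path.exists(cachedir):
        # link relative to the result folder so the tree stays relocatable
        rela_path = os.path.relpath(cachedir, outpath_result)
        os.symlink(rela_path,
                   os.path.join(outpath_result,
                                os.path.basename(outpath_this_seq)))
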
def RunJob_msa(infile, outpath, tmpdir, email, jobid, g_params):  # {{{
    """Run the SCAMPI2-msa pipeline for every sequence in infile and write
    the per-sequence results to outpath."""
    all_begin_time = time.time()

    rootname = os.path.basename(os.path.splitext(infile)[0])
    runjob_errfile = "%s/runjob.err" % (outpath)
    runjob_logfile = "%s/runjob.log" % (outpath)
    starttagfile = "%s/runjob.start" % (outpath)
    finishtagfile = "%s/runjob.finish" % (outpath)
    failtagfile = "%s/runjob.failed" % (outpath)
    rmsg = ""

    qdinit_start_tagfile = "%s/runjob.qdinit.start" % (outpath)
    # if the daemon has already started to process the job before run_job.py
    # runs in the local queue, skip it
    if os.path.exists(qdinit_start_tagfile):
        return 0

    resultpathname = jobid
    outpath_result = "%s/%s" % (outpath, resultpathname)
    tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname)

    tarball = "%s.tar.gz" % (resultpathname)
    zipfile = "%s.zip" % (resultpathname)
    tarball_fullpath = "%s.tar.gz" % (outpath_result)
    zipfile_fullpath = "%s.zip" % (outpath_result)
    resultfile_text = "%s/%s" % (outpath_result, "query.result.txt")
    mapfile = "%s/seqid_index_map.txt" % (outpath_result)
    finished_seq_file = "%s/finished_seqs.txt" % (outpath_result)
    finished_idx_file = "%s/finished_seqindex.txt" % (outpath)

    for folder in [outpath_result, tmp_outpath_result]:
        try:
            os.makedirs(folder)
        except OSError:
            msg = "Failed to create folder %s" % (folder)
            myfunc.WriteFile(msg + "\n", gen_errfile, "a")
            return 1

    try:
        open(finished_seq_file, 'w').close()
    except:
        pass

    # first get results from the cache
    # ==================================
    maplist = []
    toRunDict = {}
    hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0)
    if hdl.failure:
        isOK = False
    else:
        webcom.WriteDateTimeTagFile(starttagfile, runjob_logfile,
                                    runjob_errfile)
        cnt = 0
        origpath = os.getcwd()
        con = sqlite3.connect(db_cache_SCAMPI2MSA)
        with con:
            cur = con.cursor()
            cur.execute("""
                CREATE TABLE IF NOT EXISTS %s (
                    md5 VARCHAR(100),
                    seq VARCHAR(30000),
                    top VARCHAR(30000),
                    PRIMARY KEY (md5)
                )""" % (dbmsa_tablename))
            recordList = hdl.readseq()
            while recordList is not None:
                for rd in recordList:
                    isSkip = False
                    if not g_params['isForceRun']:
                        md5_key = hashlib.md5(rd.seq.encode('utf-8')).hexdigest()
                        cmd = "SELECT md5, seq, top FROM %s WHERE md5 = \"%s\"" % (
                            dbmsa_tablename, md5_key)
                        cur.execute(cmd)
                        rows = cur.fetchall()
                        for row in rows:
                            top = row[2]
                            numTM = myfunc.CountTM(top)
                            # info_finish has 8 items
                            info_finish = [
                                "seq_%d" % cnt,
                                str(len(rd.seq)), str(numTM),
                                "cached", str(0.0),
                                rd.description, rd.seq, top
                            ]
                            myfunc.WriteFile("\t".join(info_finish) + "\n",
                                             finished_seq_file, "a",
                                             isFlush=True)
                            myfunc.WriteFile("%d\n" % (cnt),
                                             finished_idx_file, "a",
                                             isFlush=True)
                            isSkip = True

                    if not isSkip:
                        origIndex = cnt
                        numTM = 0
                        # init value for numTM is 0
                        toRunDict[origIndex] = [rd.seq, numTM, rd.description]
                    cnt += 1
                recordList = hdl.readseq()
        hdl.close()

    if not g_params['isOnlyGetCache']:
        torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa")
        dumplist = []
        for key in toRunDict:
            top = toRunDict[key][0]
            dumplist.append(">%s\n%s" % (str(key), top))
        myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w")
        del dumplist

        sortedlist = sorted(list(toRunDict.items()), key=lambda x: x[1][1],
                            reverse=True)
        # format of sortedlist: [(origIndex, [seq, numTM, description]), ...]

        # submit the sequences one by one to the workflow in the order of
        # sortedlist
        for item in sortedlist:
            origIndex = item[0]
            seq = item[1][0]
            description = item[1][2]

            outpath_this_seq = "%s/%s" % (outpath_result,
                                          "seq_%d" % origIndex)
            tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result,
                                              "seq_%d" % (0))
            if os.path.exists(tmp_outpath_this_seq):
                try:
                    shutil.rmtree(tmp_outpath_this_seq)
                except OSError:
                    pass
            try:
                os.makedirs(tmp_outpath_this_seq)
            except OSError:
                g_params['runjob_err'].append(
                    "Failed to create tmp_outpath_this_seq %s" % (
                        tmp_outpath_this_seq))
                continue

            seqfile_this_seq = "%s/%s" % (tmp_outpath_result,
                                          "query_%d.fa" % (origIndex))
            seqcontent = ">%d\n%s\n" % (origIndex, seq)
            myfunc.WriteFile(seqcontent, seqfile_this_seq, "w")

            if not os.path.exists(seqfile_this_seq):
                g_params['runjob_err'].append(
                    "Failed to generate the seq file for index %d" % (origIndex))
                continue

            if not os.path.exists("%s/seq.fa" % (tmp_outpath_this_seq)):
                try:
                    shutil.copyfile(seqfile_this_seq,
                                    "%s/seq.fa" % (tmp_outpath_this_seq))
                except OSError:
                    pass

            numCPU = 4
            outtopfile = "%s/query.top" % (tmp_outpath_this_seq)
            cmd = [runscript_msa, seqfile_this_seq, outtopfile, blastdir,
                   blastdb]
            (t_success, runtime_in_sec) = webcom.RunCmd(cmd, runjob_logfile,
                                                        runjob_errfile,
                                                        verbose=True)

            if os.path.exists(tmp_outpath_this_seq):
                cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq]
                (isCmdSuccess, t_runtime) = webcom.RunCmd(cmd, runjob_logfile,
                                                          runjob_errfile,
                                                          verbose=True)
                if isCmdSuccess:
                    runtime = runtime_in_sec  # in seconds
                    predfile = "%s/query.top" % (outpath_this_seq)
                    (seqid, seqanno, top) = myfunc.ReadSingleFasta(predfile)
                    numTM = myfunc.CountTM(top)
                    # info_finish has 8 items
                    info_finish = [
                        "seq_%d" % origIndex,
                        str(len(seq)), str(numTM),
                        "newrun", str(runtime),
                        description, seq, top
                    ]
                    myfunc.WriteFile("\t".join(info_finish) + "\n",
                                     finished_seq_file, "a", isFlush=True)

    all_end_time = time.time()
    all_runtime_in_sec = all_end_time - all_begin_time

    if len(g_params['runjob_log']) > 0:
        rt_msg = myfunc.WriteFile("\n".join(g_params['runjob_log']) + "\n",
                                  runjob_logfile, "a")
        if rt_msg:
            g_params['runjob_err'].append(rt_msg)

    if not g_params['isOnlyGetCache'] or len(toRunDict) == 0:
        if os.path.exists(finished_seq_file):
            webcom.WriteDateTimeTagFile(finishtagfile, runjob_logfile,
                                        runjob_errfile)

        # now write the text output to a single file
        dumped_resultfile = "%s/%s" % (outpath_result, "query.top")
        statfile = "%s/%s" % (outpath_result, "stat.txt")
        webcom.WriteSCAMPI2MSATextResultFile(dumped_resultfile,
                                             outpath_result, maplist,
                                             all_runtime_in_sec,
                                             g_params['base_www_url'],
                                             statfile=statfile)

        # make a zip archive instead of a tarball (for Windows users)
        pwd = os.getcwd()
        os.chdir(outpath)
        cmd = ["zip", "-rq", zipfile, resultpathname]
        webcom.RunCmd(cmd, runjob_logfile, runjob_errfile)
        os.chdir(pwd)

        isSuccess = False
        if (os.path.exists(finishtagfile) and os.path.exists(zipfile_fullpath)):
            isSuccess = True
            # delete the tmpdir on success; comment this out to keep the
            # tmpdir when debugging
            shutil.rmtree(tmpdir)
        else:
            isSuccess = False
            webcom.WriteDateTimeTagFile(failtagfile, runjob_logfile,
                                        runjob_errfile)

        finish_status = ""  # one of ["success", "failed", "partly_failed"]
        if isSuccess:
            finish_status = "success"
        else:
            finish_status = "failed"

        # send the result by email
        # do not send mail from the cloud VM
        if webcom.IsFrontEndNode(g_params['base_www_url']) and myfunc.IsValidEmailAddress(email):
            webcom.SendEmail_on_finish(jobid, g_params['base_www_url'],
                                       finish_status,
                                       name_server="SCAMPI2-msa",
                                       from_email="*****@*****.**",
                                       to_email=email,
                                       contact_email=contact_email,
                                       logfile=runjob_logfile,
                                       errfile=runjob_errfile)
    return 0
# }}}
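
# RunJob_msa above caches predicted topologies in a single SQLite table keyed
# by the md5 of the sequence. The sketch below shows that lookup/store round
# trip in a self-contained form; the function name and the INSERT branch are
# assumptions for illustration (the original only shows CREATE TABLE and
# SELECT), and it uses a parameterized query for the md5 value instead of
# string interpolation.
import hashlib
import sqlite3


def sketch_topology_cache(db_file, tablename, seq, top=None):
    """Return a cached topology for seq, storing top first if given."""
    md5_key = hashlib.md5(seq.encode('utf-8')).hexdigest()
    con = sqlite3.connect(db_file)
    with con:  # commits on success
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS %s ("
                    " md5 VARCHAR(100), seq VARCHAR(30000),"
                    " top VARCHAR(30000), PRIMARY KEY (md5))" % (tablename))
        if top is not None:
            cur.execute("INSERT OR REPLACE INTO %s (md5, seq, top)"
                        " VALUES (?, ?, ?)" % (tablename),
                        (md5_key, seq, top))
        cur.execute("SELECT top FROM %s WHERE md5 = ?" % (tablename),
                    (md5_key,))
        row = cur.fetchone()
    con.close()
    return row[0] if row else None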