def FilterSignalPeptide(topofile, sigpepDict, outfile, isDeleteSeqWithSignalPeptide):
    """Copy the topologies in topofile to outfile, filtering signal peptides.

    Records are read block by block through myfunc.ReadFastaByBlock. For each
    record:
      * if sigpepDict holds a value other than -1 for the record's seqid, the
        topology is either discarded (isDeleteSeqWithSignalPeptide is true)
        or replaced by the result of
        lcmp.FilterSignalPeptideInTopology(topology, that value);
      * otherwise the topology is kept unchanged.
    A record is written (">description" line followed by the topology) only
    when the resulting topology is non-empty and myfunc.CountTM() reports at
    least one TM helix for it.

    Args:
        topofile: path of the input topology file.
        sigpepDict: mapping seqid -> signal peptide position; a missing key
            is treated the same as the value -1.
        outfile: output target handed to myfunc.myopen with sys.stdout as the
            default handle (presumably an empty name selects stdout --
            confirm against myfunc.myopen).
        isDeleteSeqWithSignalPeptide: when true, sequences that have a signal
            peptide are dropped instead of being filtered.

    Returns:
        1 if the reader reports a failure on opening topofile, otherwise 0.
    """
    hdl = myfunc.ReadFastaByBlock(topofile)
    if hdl.failure:
        return 1

    fpout = None
    try:
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        recordList = hdl.readseq()
        # readseq() signals the end of the input by returning None
        while recordList is not None:
            for rd in recordList:
                sp_pos = sigpepDict.get(rd.seqid, -1)
                if sp_pos == -1:
                    newtopo = rd.seq
                elif isDeleteSeqWithSignalPeptide:
                    newtopo = ""
                else:
                    newtopo = lcmp.FilterSignalPeptideInTopology(rd.seq, sp_pos)

                if newtopo != "" and myfunc.CountTM(newtopo) > 0:
                    fpout.write(">%s\n" % (rd.description))
                    fpout.write("%s\n" % (newtopo))
            recordList = hdl.readseq()
    finally:
        # release both handles even when filtering or writing raises
        hdl.close()
        if fpout is not None:
            myfunc.myclose(fpout)
    return 0
def TopoAddDGscore( idListTopo, annotationListTopo, topoList, dgScoreDict, #{{{
        fpTopoDG):
    """Write every topology to fpTopoDG followed by a "{dgscore ...}" line.

    For the sequence at each index of topoList three pieces are written:
    ">" plus its annotation line, the topology itself, and a line of the form
    "{dgscore s1 s2 ... }". The scores come from dgScoreDict[seqID]; they are
    only written when their number equals myfunc.CountTM(topology). When the
    numbers disagree a message goes to stderr and the braces stay empty, as
    they do when the seqID has no entry in dgScoreDict.

    Always returns 0.
    """
    for idx, topology in enumerate(topoList):
        seq_id = idListTopo[idx]
        annotation = annotationListTopo[idx]
        tm_count = myfunc.CountTM(topology)

        #write topoDG
        fpTopoDG.write(">%s\n" % annotation)
        fpTopoDG.write("%s\n" % topology)
        fpTopoDG.write("{dgscore ")

        if seq_id in dgScoreDict:
            scores = dgScoreDict[seq_id]
            score_count = len(scores)
            if score_count == tm_count:
                for k in range(tm_count):
                    fpTopoDG.write("%s " % scores[k])
            else:
                message = (
                    "num DGscores for seqID %s (%d) " % (seq_id, score_count)
                    + "!= numTM (%d) for the topology. dglist = " % (tm_count))
                # same text as the two-item print: message, one space, list
                sys.stderr.write("%s %s\n" % (message, scores))

        fpTopoDG.write("}\n")
    return 0
def IsIdenticalTopology_simple( topo1, topo2, min_TM_overlap = 5):#{{{
    """Return True when topo1 and topo2 count as the same topology.

    Created 2011-11-15, updated 2011-11-15.
    Following the criteria attributed to Krogh et al. 2001, two topologies
    are identical if
      1. they have the same number of TM helices,
      2. each pair of corresponding helices shares at least min_TM_overlap
         positions that are 'M' in both topologies, and
      3. the helices are oriented the same way (same N-terminal state).

    Helix positions come from myfunc.GetTMPosition as (begin, end) pairs and
    are scanned here as half-open index ranges.
    """
    tm_count_a = myfunc.CountTM(topo1)
    tm_count_b = myfunc.CountTM(topo2)
    nterm_a = GetNtermState(topo1)
    nterm_b = GetNtermState(topo2)
    helices_a = myfunc.GetTMPosition(topo1)
    helices_b = myfunc.GetTMPosition(topo2)

    # criteria 1 and 3: helix count and orientation must agree
    if tm_count_a != tm_count_b:
        return False
    if nterm_a != nterm_b:
        return False

    # criterion 2: helix-by-helix overlap
    for k in range(tm_count_a):
        (start_a, end_a) = helices_a[k]
        (start_b, end_b) = helices_b[k]
        shared_start = max(start_a, start_b)
        shared_end = min(end_a, end_b)
        if shared_end - shared_start <= 0:
            return False

        shared_m = 0
        for pos in range(shared_start, shared_end):
            if topo1[pos] == 'M' and topo2[pos] == 'M':
                shared_m += 1
                if shared_m >= min_TM_overlap:
                    # enough common residues, no need to scan further
                    break
        if shared_m < min_TM_overlap:
            return False

    return True
def WriteTextResultFile(outfile, outpath_result, maplist, runtime_in_sec, statfile=""):#{{{
    """Collect the per-sequence predicted topologies into one text file.

    Each item of maplist is a tab-separated line whose first field is the
    sub-folder name of a sequence and whose third field is its description.
    The topology is read from
    "<outpath_result>/<subfolder>/query_topologies.txt" with
    myfunc.ReadSingleFasta and written to outfile as ">description" followed
    by the topology. When statfile is given, the number of sequences with at
    least one TM helix (myfunc.CountTM > 0) is written to it as
    "numTMPro\\t<count>".

    Args:
        outfile: path of the text result file to create.
        outpath_result: folder holding one sub-folder per sequence.
        maplist: list of tab-separated map lines (see above).
        runtime_in_sec: accepted for interface compatibility; not used here.
        statfile: optional path for the statistics file ("" disables it).

    Progress messages are appended to g_params['runjob_log']. An IOError
    while opening or writing either file is reported on stdout, not raised.
    """
    numTMPro = 0
    fpstat = None
    try:
        # "with" guarantees outfile is closed; it was left open before
        with open(outfile, "w") as fpout:
            if statfile != "":
                fpstat = open(statfile, "w")

            for line in maplist:
                strs = line.split('\t')
                subfoldername = strs[0]
                desp = strs[2]
                outpath_this_seq = "%s/%s"%(outpath_result, subfoldername)
                predfile = "%s/query_topologies.txt"%(outpath_this_seq)
                g_params['runjob_log'].append("predfile = %s.\n"%(predfile))
                if not os.path.exists(predfile):
                    g_params['runjob_log'].append("predfile %s does not exist\n"%(predfile))
                # The read is still attempted for a missing predfile, exactly
                # as before; what ReadSingleFasta returns in that case is not
                # visible from here.
                (seqid, seqanno, top) = myfunc.ReadSingleFasta(predfile)
                fpout.write(">%s\n%s\n"%(desp, top))
                if myfunc.CountTM(top) > 0:
                    numTMPro += 1

            if fpstat is not None:
                fpstat.write("numTMPro\t%d\n"%(numTMPro))
    except IOError:
        # parenthesised single argument: prints the same line under the
        # Python 2 print statement and the Python 3 print function
        print("Failed to write to file %s"%(outfile))
    finally:
        if fpstat is not None:
            fpstat.close()
def main(g_params): #{{{
    """Parse the command line and benchmark predicted against real topologies.

    Recognised options (each also accepted with a leading "--"):
        -h, --help            print the help text and return 1
        -o, --o, -outfile     output file (handed to myfunc.myopen with
                              sys.stdout as the default handle)
        -owrong               output file for wrongly predicted topologies
        -realtopo             file with the real topologies (required)
        -seqfile              optional sequence file
        -mode                 "tp" or "tps"; derived from path_predtopo when
                              empty ("topcons_single" -> "tps",
                              "topcons" -> "tp")
        -path_predtopo        location of the predicted topologies
        -basename             basename, must not be empty
        -restrictidlist       file with IDs to restrict the analysis to
        -q, -rmsp, -debug     set g_params['isQuiet'/'isRMSP'/'isDEBUG']
    Positional arguments are not accepted.

    Args:
        g_params: dictionary of global settings; option values are stored in
            it and 'mode', 'path_predtopo' and 'basename' are read from it.

    Returns:
        1 on any usage or input error, 0 after a completed run.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    real_topofile = ""
    seqfile = ""
    restrictIDListFile = ""
    outfile_wrong_predtopo = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            # this program takes no positional arguments, so anything that
            # follows "--" is an error
            sys.stderr.write("Error! Wrong argument: %s\n" % (arg))
            return 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-owrong", "--owrong"]:
                (outfile_wrong_predtopo, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-realtopo", "--realtopo"]:
                (real_topofile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-seqfile", "--seqfile"]:
                (seqfile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-mode", "--mode"]:
                (g_params['mode'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-path_predtopo", "--path_predtopo"]:
                (g_params['path_predtopo'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-restrictidlist", "--restrictidlist"]:
                (restrictIDListFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif arg in ["-rmsp", "--rmsp"]:
                g_params['isRMSP'] = True
                i += 1
            elif arg in ["-debug", "--debug"]:
                g_params['isDEBUG'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % (arg))
                return 1
        else:
            sys.stderr.write("Error! Wrong argument: %s\n" % (arg))
            return 1

    if myfunc.checkfile(g_params['path_predtopo'], "path_predtopo") != 0:
        return 1
    if g_params['basename'] == "":
        sys.stderr.write("%s: basename not set. exit\n" % (argv[0]))
        return 1
    if myfunc.checkfile(real_topofile, "real_topofile") != 0:
        return 1

    if restrictIDListFile != "":
        g_params['restrictIDset'] = set(myfunc.ReadIDList(restrictIDListFile))
        g_params['isRestrictIDList'] = True

    if g_params['mode'] == "":
        # "topcons_single" must be tested first, it contains "topcons"
        if g_params['path_predtopo'].find("topcons_single") >= 0:
            g_params['mode'] = "tps"
        elif g_params['path_predtopo'].find("topcons") >= 0:
            g_params['mode'] = "tp"
        else:
            # the value lives in g_params; a bare path_predtopo name does
            # not exist in this function
            sys.stderr.write(
                "mode not set, and can not be recognized from path_predtopo=%s\n"
                % (g_params['path_predtopo']))
            return 1

    if g_params['mode'] not in ["tp", "tps"]:
        sys.stderr.write("Unrecognized mode = %s\n" % (g_params['mode']))
        return 1

    (real_idlist, real_annolist, real_topolist) = myfunc.ReadFasta(real_topofile)
    seqDict = {}
    if seqfile != "" and os.path.exists(seqfile):
        (seq_idlist, seq_annolist, seqlist) = myfunc.ReadFasta(seqfile)
        seqDict = dict(zip(seq_idlist, seqlist))

    if len(real_idlist) <= 0:
        sys.stderr.write("Failed to read real_topofile %s\n" % (real_topofile))
        return 1

    real_topodict = dict(zip(real_idlist, real_topolist))

    # split the IDs by the number of TM helices of the real topology
    idSet_single = set()
    idSet_multi = set()
    for (seqid, topo) in real_topodict.items():
        numTM = myfunc.CountTM(topo)
        if numTM == 1:
            idSet_single.add(seqid)
        elif numTM > 1:
            idSet_multi.add(seqid)

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    # default handle is None here, so this may be None (presumably when no
    # -owrong file was requested -- confirm against myfunc.myopen)
    fpout_wrong = myfunc.myopen(outfile_wrong_predtopo, None, "w", False)
    try:
        #for TM_type in ["All_Alpha", "Single", "Multi"]:
        for TM_type in ["All_Alpha"]:
            if TM_type == "All_Alpha":
                sub_real_topodict = real_topodict
            else:
                sub_real_topodict = {}
                for (seqid, topo) in real_topodict.items():
                    numTM = myfunc.CountTM(topo)
                    if TM_type == "Single" and numTM == 1:
                        sub_real_topodict[seqid] = topo
                    elif TM_type == "Multi" and numTM > 1:
                        sub_real_topodict[seqid] = topo
            Benchmark(sub_real_topodict, idSet_single, idSet_multi, TM_type,
                    fpout, fpout_wrong, seqDict)
    finally:
        # close both outputs, also when Benchmark raises
        myfunc.myclose(fpout)
        if fpout_wrong is not None:
            myfunc.myclose(fpout_wrong)
    return 0
def main(g_params): #{{{
    """Print the number of TM helices of every topology in a topology file.

    Usage: <program> [options] topofile
        -h, --help        print the help text and return 1
        -o, --o, -out     output file (handed to myfunc.myopen with
                          sys.stdout as the default handle)
        -q, --q           accepted; has no effect in this function
        -ni, --ni, -noid  print only the counts, without the leading ID
        --                treat the next argument as the topology file

    The input is read in chunks of BLOCK_SIZE and parsed incrementally with
    myfunc.ReadFastaFromBuffer. For each record one line is written:
    "<id>\\t<numTM>", or just "<numTM>" when -noid is given.

    Args:
        g_params: dictionary of global settings (not used in this function).

    Returns:
        1 on a usage or input error, otherwise 0.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    isPrintIDName = True
    outfile = ""
    topofile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg:
            # only the single argument right after "--" is taken literally
            isNonOptionArg = False
            topofile = arg
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-o", "--o", "-out"]:
                if i + 1 >= numArgv:
                    # the option used to index past the end of sys.argv
                    sys.stderr.write(
                        "Error! Option %s requires an argument\n" % (arg))
                    return 1
                outfile = sys.argv[i + 1]
                i += 2
            elif arg in ["-q", "--q"]:
                i += 1
            elif arg in ["-ni", "--ni", "-noid"]:
                isPrintIDName = False
                i += 1
            else:
                # sent to stderr like every other error message here
                sys.stderr.write("Error! Wrong argument: %s\n" % (arg))
                return 1
        else:
            topofile = arg
            i += 1

    if topofile == "":
        sys.stderr.write("topofile not set. Exit.\n")
        return 1
    elif not os.path.exists(topofile):
        sys.stderr.write("topofile %s doe not exist. Exit.\n" % topofile)
        return 1

    # open() raises on failure instead of returning a false value, so the
    # failure has to be caught rather than tested
    try:
        fpin = open(topofile, "rb")
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % (topofile))
        return 1

    fpout = None
    try:
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        unprocessedBuffer = ""
        isEOFreached = False
        while True:
            buff = fpin.read(BLOCK_SIZE)
            if len(buff) < BLOCK_SIZE:
                # a short read marks the last chunk
                isEOFreached = True
            # prepend what the previous round left unparsed
            buff = unprocessedBuffer + buff
            recordList = []
            unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
            for rd in recordList:
                # rd[0] is written as the ID and rd[2] is handed to CountTM
                if isPrintIDName:
                    fpout.write("%s\t" % rd[0])
                fpout.write("%d\n" % myfunc.CountTM(rd[2]))
            if isEOFreached:
                break
    finally:
        fpin.close()
        if fpout is not None:
            myfunc.myclose(fpout)
    return 0
#border color; used when BORDER_WIDTH is above 0 #BORDER_COLOR\t#0000ff #=================================================================# # Actual data follows after the "DATA" keyword # #=================================================================# DATA """ fpout.write(dataset_settings) for i in range(numSeq): gid = idList[i] if gid != 'Consensus': n_i = 0 n_o = 0 NtermState = lcmp.GetNtermState(seqList[i]) numTM = myfunc.CountTM(seqList[i]) if NtermState == 'o': n_i = 0 n_o = numTM else: n_i = numTM n_o = 0 fpout.write("%s\t%d\t%d\n" % (gid, n_i, n_o)) fpout.write("\n") if fpout != sys.stdout: fpout.close()
def RunJob(infile, outpath, tmpdir, email, jobid, g_params): #{{{ all_begin_time = time.time() rootname = os.path.basename(os.path.splitext(infile)[0]) starttagfile = "%s/runjob.start" % (outpath) runjob_errfile = "%s/runjob.err" % (outpath) runjob_logfile = "%s/runjob.log" % (outpath) finishtagfile = "%s/runjob.finish" % (outpath) rmsg = "" resultpathname = jobid outpath_result = "%s/%s" % (outpath, resultpathname) tarball = "%s.tar.gz" % (resultpathname) zipfile = "%s.zip" % (resultpathname) tarball_fullpath = "%s.tar.gz" % (outpath_result) zipfile_fullpath = "%s.zip" % (outpath_result) outfile = "%s/%s/Topcons/topcons.top" % (outpath_result, "seq_%d" % (0)) resultfile_text = "%s/%s" % (outpath_result, "query.result.txt") mapfile = "%s/seqid_index_map.txt" % (outpath_result) finished_seq_file = "%s/finished_seqs.txt" % (outpath_result) tmp_outpath_result = "%s/%s" % (tmpdir, resultpathname) isOK = True try: os.makedirs(tmp_outpath_result) isOK = True except OSError: msg = "Failed to create folder %s" % (tmp_outpath_result) myfunc.WriteFile(msg + "\n", runjob_errfile, "a") isOK = False pass try: os.makedirs(outpath_result) isOK = True except OSError: msg = "Failed to create folder %s" % (outpath_result) myfunc.WriteFile(msg + "\n", runjob_errfile, "a") isOK = False pass if isOK: try: open(finished_seq_file, 'w').close() except: pass #first getting result from caches # ================================== maplist = [] maplist_simple = [] toRunDict = {} hdl = myfunc.ReadFastaByBlock(infile, method_seqid=0, method_seq=0) if hdl.failure: isOK = False else: datetime = time.strftime("%Y-%m-%d %H:%M:%S") rt_msg = myfunc.WriteFile(datetime, starttagfile) recordList = hdl.readseq() cnt = 0 origpath = os.getcwd() while recordList != None: for rd in recordList: isSkip = False # temp outpath for the sequence is always seq_0, and I feed # only one seq a time to the workflow tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" % 0) outpath_this_seq = "%s/%s" % 
(outpath_result, "seq_%d" % cnt) subfoldername_this_seq = "seq_%d" % (cnt) if os.path.exists(tmp_outpath_this_seq): try: shutil.rmtree(tmp_outpath_this_seq) except OSError: pass maplist.append( "%s\t%d\t%s\t%s" % ("seq_%d" % cnt, len(rd.seq), rd.description, rd.seq)) maplist_simple.append( "%s\t%d\t%s" % ("seq_%d" % cnt, len(rd.seq), rd.description)) if not g_params['isForceRun']: md5_key = hashlib.md5(rd.seq).hexdigest() subfoldername = md5_key[:2] md5_link = "%s/%s/%s" % (path_md5cache, subfoldername, md5_key) if os.path.exists(md5_link): # create a symlink to the cache rela_path = os.path.relpath( md5_link, outpath_result) #relative path os.chdir(outpath_result) os.symlink(rela_path, subfoldername_this_seq) if os.path.exists(outpath_this_seq): runtime = 0.0 #in seconds topfile = "%s/%s/topcons.top" % ( outpath_this_seq, "Topcons") top = myfunc.ReadFile(topfile).strip() numTM = myfunc.CountTM(top) posSP = myfunc.GetSPPosition(top) if len(posSP) > 0: isHasSP = True else: isHasSP = False info_finish = [ "seq_%d" % cnt, str(len(rd.seq)), str(numTM), str(isHasSP), "cached", str(runtime), rd.description ] myfunc.WriteFile("\t".join(info_finish) + "\n", finished_seq_file, "a", isFlush=True) isSkip = True if not isSkip: # first try to delete the outfolder if exists if os.path.exists(outpath_this_seq): try: shutil.rmtree(outpath_this_seq) except OSError: pass origIndex = cnt numTM = 0 toRunDict[origIndex] = [rd.seq, numTM, rd.description ] #init value for numTM is 0 cnt += 1 recordList = hdl.readseq() hdl.close() myfunc.WriteFile("\n".join(maplist_simple) + "\n", mapfile) # run scampi single to estimate the number of TM helices and then run # the query sequences in the descending order of numTM torun_all_seqfile = "%s/%s" % (tmp_outpath_result, "query.torun.fa") dumplist = [] for key in toRunDict: top = toRunDict[key][0] dumplist.append(">%s\n%s" % (str(key), top)) myfunc.WriteFile("\n".join(dumplist) + "\n", torun_all_seqfile, "w") del dumplist topfile_scampiseq = 
"%s/%s" % (tmp_outpath_result, "query.torun.fa.topo") if os.path.exists(torun_all_seqfile): # run scampi to estimate the number of TM helices cmd = [ script_scampi, torun_all_seqfile, "-outpath", tmp_outpath_result ] try: rmsg = subprocess.check_output(cmd) except subprocess.CalledProcessError, e: g_params['runjob_err'].append(str(e) + "\n") pass if os.path.exists(topfile_scampiseq): (idlist_scampi, annolist_scampi, toplist_scampi) = myfunc.ReadFasta(topfile_scampiseq) for jj in xrange(len(idlist_scampi)): numTM = myfunc.CountTM(toplist_scampi[jj]) try: toRunDict[int(idlist_scampi[jj])][1] = numTM except (KeyError, ValueError, TypeError): pass sortedlist = sorted(toRunDict.items(), key=lambda x: x[1][1], reverse=True) #format of sortedlist [(origIndex: [seq, numTM, description]), ...] # submit sequences one by one to the workflow according to orders in # sortedlist for item in sortedlist: # g_params['runjob_log'].append("tmpdir = %s"%(tmpdir)) #cmd = [script_getseqlen, infile, "-o", tmp_outfile , "-printid"] origIndex = item[0] seq = item[1][0] description = item[1][2] outpath_this_seq = "%s/%s" % (outpath_result, "seq_%d" % origIndex) tmp_outpath_this_seq = "%s/%s" % (tmp_outpath_result, "seq_%d" % (0)) if os.path.exists(tmp_outpath_this_seq): try: shutil.rmtree(tmp_outpath_this_seq) except OSError: pass seqfile_this_seq = "%s/%s" % (tmp_outpath_result, "query_%d.fa" % (origIndex)) seqcontent = ">%d\n%s\n" % (origIndex, seq) myfunc.WriteFile(seqcontent, seqfile_this_seq, "w") if not os.path.exists(seqfile_this_seq): g_params['runjob_err'].append( "failed to generate seq index %d" % (origIndex)) continue cmd = [ runscript, seqfile_this_seq, tmp_outpath_result, blastdir, blastdb ] g_params['runjob_log'].append(" ".join(cmd)) begin_time = time.time() try: rmsg = subprocess.check_output(cmd) g_params['runjob_log'].append("workflow:\n" + rmsg + "\n") except subprocess.CalledProcessError, e: g_params['runjob_err'].append(str(e) + "\n") g_params['runjob_err'].append(rmsg 
+ "\n") pass #suqoutfilelist = glob.glob("%s/*.sh.*.out"%(tmpdir)) #if len(suqoutfilelist)>0: # suqoutfile = suqoutfilelist[0] #g_params['runjob_err'].append(myfunc.ReadFile(suqoutfile)) end_time = time.time() runtime_in_sec = end_time - begin_time if os.path.exists(tmp_outpath_this_seq): cmd = ["mv", "-f", tmp_outpath_this_seq, outpath_this_seq] isCmdSuccess = False try: subprocess.check_output(cmd) isCmdSuccess = True except subprocess.CalledProcessError, e: msg = "Failed to run prediction for sequence No. %d\n" % ( origIndex) g_params['runjob_err'].append(msg) g_params['runjob_err'].append(str(e) + "\n") pass timefile = "%s/time.txt" % (tmp_outpath_result) targetfile = "%s/time.txt" % (outpath_this_seq) if os.path.exists(timefile) and os.path.exists( outpath_this_seq): try: shutil.move(timefile, targetfile) except: g_params['runjob_err'].append( "Failed to move %s/time.txt" % (tmp_outpath_result) + "\n") pass if isCmdSuccess: runtime = runtime_in_sec #in seconds topfile = "%s/%s/topcons.top" % (outpath_this_seq, "Topcons") top = myfunc.ReadFile(topfile).strip() numTM = myfunc.CountTM(top) posSP = myfunc.GetSPPosition(top) if len(posSP) > 0: isHasSP = True else: isHasSP = False info_finish = [ "seq_%d" % origIndex, str(len(seq)), str(numTM), str(isHasSP), "newrun", str(runtime), description ] myfunc.WriteFile("\t".join(info_finish) + "\n", finished_seq_file, "a", isFlush=True) # now write the text output for this seq info_this_seq = "%s\t%d\t%s\t%s" % ( "seq_%d" % origIndex, len(seq), description, seq) resultfile_text_this_seq = "%s/%s" % (outpath_this_seq, "query.result.txt") myfunc.WriteTOPCONSTextResultFile(resultfile_text_this_seq, outpath_result, [info_this_seq], runtime_in_sec, g_params['base_www_url']) # create or update the md5 cache # create cache only on the front-end if g_params['base_www_url'].find("topcons.net") != -1: md5_key = hashlib.md5(seq).hexdigest() subfoldername = md5_key[:2] md5_subfolder = "%s/%s" % (path_md5cache, subfoldername) md5_link 
= "%s/%s/%s" % (path_md5cache, subfoldername, md5_key) if os.path.exists(md5_link): try: os.unlink(md5_link) except: pass subfolder_md5 = "%s/%s" % (path_md5cache, subfoldername) if not os.path.exists(subfolder_md5): try: os.makedirs(subfolder_md5) except: pass rela_path = os.path.relpath( outpath_this_seq, md5_subfolder) #relative path try: os.chdir(md5_subfolder) os.symlink(rela_path, md5_key) except: pass