def ExtractFromTopconsSingleAllinfo(recordContent, seqid2AnnoDict): #{{{
    """Extract one record of a TOPCONS-single "allinfo" dump.

    recordContent  : text of a single record; empty lines are ignored.
    seqid2AnnoDict : mapping from sequence ID to the annotation to store.

    Lines whose first four characters are 'NAME' contribute their text
    from column 9 onwards to the annotation; all other non-empty lines are
    collected unchanged.

    Returns {'anno': seqid2AnnoDict[seqid], 'otherLineList': [...]} when
    the sequence ID is a key of seqid2AnnoDict, otherwise {}.
    """
    # A comprehension (instead of filter()) keeps the emptiness test valid
    # whether filter() returns a list or an iterator.
    lines = [line for line in recordContent.split("\n") if line]
    if not lines:
        return {}

    annoPartList = []
    otherLineList = []
    for line in lines:
        if line[0:4] == 'NAME':
            annoPartList.append(line[9:])
        else:
            otherLineList.append(line)

    seqid = myfunc.GetSeqIDFromAnnotation("".join(annoPartList))
    if seqid.find("UniRef") == 0:
        # Take the part after the first underscore; an ID without any
        # underscore is kept as it is instead of raising IndexError.
        strs = seqid.split("_")
        if len(strs) > 1:
            seqid = strs[1]

    if seqid not in seqid2AnnoDict:
        return {}
    return {
        'anno': seqid2AnnoDict[seqid],
        'otherLineList': otherLineList,
    }
def GetDatabaseIDList(annoList):#{{{
    """Return the unique sequence IDs found in a list of annotation lines.

    Empty entries and entries starting with '#' are skipped.  The first
    word of each remaining entry is cut at the first '(' or '/', and words
    containing "target" are ignored.  Duplicates are removed with
    myfunc.uniquelist().
    """
    collectedIDs = []
    for anno in annoList:
        if anno == "" or anno[0] == "#":
            continue
        word = myfunc.GetFirstWord(anno)
        # Position of the earliest '(' or '/', or the word length if the
        # word contains neither.
        cutPosList = [pos for pos in (word.find('('), word.find('/'))
                      if pos != -1]
        cutPosList.append(len(word))
        word = word[:min(cutPosList)]
        if word.find("target") == -1:
            collectedIDs.append(myfunc.GetSeqIDFromAnnotation(word))
    return myfunc.uniquelist(collectedIDs)
def Phobius2Fasta(infile, outpath): #{{{
    """Convert a Phobius result file to a FASTA-like topology file.

    infile  : Phobius result file, read in chunks of BLOCK_SIZE characters.
    outpath : directory receiving "<rootname>_PHOBIUS.topo", where rootname
              is the base name of infile without its extension.

    Every record with a non-empty 'predtopo' is written as a ">anno" line
    followed by the topology line.  IOError is reported on stderr and
    re-raised.
    """
    try:
        rootname = os.path.basename(os.path.splitext(infile)[0])
        outfile = outpath + os.sep + rootname + "_PHOBIUS.topo"
        # The with-blocks close both files even when parsing fails.
        with open(outfile, "w") as fpout:
            with open(infile, "r") as fpin:
                unprocessedBuffer = ""
                isEOFreached = False
                while not isEOFreached:
                    buff = fpin.read(BLOCK_SIZE)
                    if len(buff) < BLOCK_SIZE:
                        isEOFreached = True
                    recordList = []
                    # The parser returns the tail it could not process yet;
                    # it is prepended to the next chunk.
                    unprocessedBuffer = Read_phobius_result_from_buffer(
                        unprocessedBuffer + buff, recordList, isEOFreached)
                    for record in recordList:
                        if record['predtopo'] != "":
                            fpout.write(">%s\n" % (record['anno']))
                            fpout.write("%s\n" % record['predtopo'])
        print("Results have been output to")
        print("\t%s" % outfile)
    except IOError:
        sys.stderr.write("Failed to open file %s for read\n" % (infile))
        raise
def ReadTopoWithDGScore(inFile): #{{{
    """Read topologies with DG scores from a file.

    A record is a line starting with '>', followed by the topology line
    and a line of DG scores ('/' characters in that line are dropped).

    Returns (topoWithDGScoreList, indexID): a list of dicts with the keys
    'annoline', 'id', 'topo' and 'dgscores', and a dict mapping each
    sequence ID to the index of its record in that list.
    """
    with open(inFile, "r") as fpin:
        lines = fpin.readlines()

    recordList = []
    indexID = {}
    numLine = len(lines)
    idx = 0
    while idx < numLine:
        if lines[idx][0] != '>':
            idx += 1
            continue
        annoLine = lines[idx].lstrip(">").strip()
        topo = lines[idx + 1].strip()
        scoreWords = lines[idx + 2].replace("/", "").split()
        dgscores = [float(word) for word in scoreWords]
        seqID = myfunc.GetSeqIDFromAnnotation(annoLine)
        indexID[seqID] = len(recordList)
        recordList.append({
            'annoline': annoLine,
            'id': seqID,
            'topo': topo,
            'dgscores': dgscores,
        })
        # Skip the topology line as well; the score line is passed over by
        # the non-header branch above.
        idx += 2
    return (recordList, indexID)
def GetFastaID2(infile, fpout): #{{{
    """Write the sequence ID of every annotation line of a FASTA file.

    The file is read in chunks of BLOCK_SIZE characters; an annotation
    line cut by a chunk boundary is completed with the next chunk.  When
    g_params['isPrintAnnoLine'] is true, each output line is
    "<id>\\t<annotation>", otherwise just "<id>".

    An annotation line at the very end of the file without a trailing
    newline is also written.
    """
    isPrintAnnoLine = g_params['isPrintAnnoLine']

    def _emit(annoLine):
        # Write one annotation line (leading '>' still attached).
        annoLine = annoLine.lstrip(">").rstrip("\n")
        idd = myfunc.GetSeqIDFromAnnotation(annoLine)
        if isPrintAnnoLine:
            fpout.write("%s\t%s\n" % (idd, annoLine))
        else:
            fpout.write("%s\n" % (idd,))

    brokenAnnoLine = ""  # annotation line cut by the chunk boundary
    fpin = open(infile, "r")
    try:
        buff = fpin.read(BLOCK_SIZE)
        while buff:
            beg = 0
            if brokenAnnoLine:
                end = buff.find("\n")
                if end < 0:
                    # The pending annotation line spans this whole chunk.
                    brokenAnnoLine += buff
                    buff = fpin.read(BLOCK_SIZE)
                    continue
                _emit(brokenAnnoLine + buff[:end])
                brokenAnnoLine = ""
                beg = end
            while True:
                beg = buff.find(">", beg)
                if beg < 0:
                    break
                end = buff.find("\n", beg + 1)
                if end < 0:
                    brokenAnnoLine = buff[beg:]
                    break
                _emit(buff[beg:end])
                beg = end
            buff = fpin.read(BLOCK_SIZE)
    finally:
        fpin.close()

    if brokenAnnoLine:
        # Last annotation line was not newline-terminated.
        _emit(brokenAnnoLine)
def BlastM9toPairlist(infile, fpout): #{{{
    """Write "<queryID> <hit sequence ID>" for a BLAST -m9 output file.

    Records are read with ReadBlastOutput() using the iteration configured
    in g_params['iteration'].
    """
    recordList = ReadBlastOutput(infile,
                                 iteration=g_params['iteration'],
                                 fmt=9)
    # NOTE(review): only the first record (if any) is written -- confirm
    # that restricting the output to one pair is intended.
    for rd in recordList[:1]:
        queryID = rd['queryID']
        hitID = myfunc.GetSeqIDFromAnnotation(rd['hitID'])
        fpout.write("%s %s\n" % (queryID, hitID))
def WriteOnelineSeq(seqWithAnno, idSet, fpout): #{{{
    """Write a sequence on one line; records with redundant IDs are ignored.

    seqWithAnno : one FASTA record (annotation line, then sequence lines).
    idSet       : set of IDs already written; updated in place.
    fpout       : output handle receiving "<seqid> <sequence>".
    """
    # partition() also handles a record without any newline: the whole
    # text is then the annotation and the sequence is empty.
    annoLine, _sep, seqPart = seqWithAnno.partition("\n")
    seqid = myfunc.GetSeqIDFromAnnotation(annoLine)
    if seqid in idSet:
        return
    aaSeq = seqPart.replace('\n', '').replace(' ', '')
    fpout.write("%s %s\n" % (seqid, aaSeq))
    idSet.add(seqid)
def ReplaceFastaAnnotation(fastaFile, annotationFile, fpout):
    """Replace the annotation lines of a FASTA file.

    The annotation file is read line by line.  For every line starting
    with '>', the next record of the FASTA file is taken, its sequence ID
    is compared with the ID of the new annotation line, and the new
    annotation followed by the sequence lines is written to fpout.

    Exits with status 1 (after a message on stderr) when the IDs differ.
    Returns 0 otherwise.
    """
    cntAnno = 0
    cntLineFasta = 0
    lineFa = ""
    # The with-blocks close both inputs even on sys.exit() or an error.
    with open(fastaFile, "r") as fpFa:
        with open(annotationFile, "r") as fpAnno:
            lineAnno = fpAnno.readline()
            cntAnno += 1
            while lineAnno:
                lineAnno = lineAnno.strip()
                if lineAnno.startswith(">"):
                    # Read a line only when we are not already standing on
                    # the header that ended the previous record.
                    if not lineFa.startswith(">"):
                        lineFa = fpFa.readline()
                        cntLineFasta += 1
                        lineFa = lineFa.strip()
                    if lineFa.startswith(">"):  # starting a new sequence
                        idFa = myfunc.GetSeqIDFromAnnotation(lineFa)
                        idAnno = myfunc.GetSeqIDFromAnnotation(lineAnno)
                        if idFa != idAnno:
                            sys.stderr.write(
                                "annotation line does not match, "
                                "annoRecord=%d, fastaLine=%d\n" % (
                                    cntAnno + 1, cntLineFasta + 1))
                            sys.stderr.write(
                                "idFa=%s, idAnno=%s\n" % (idFa, idAnno))
                            sys.exit(1)
                        fpout.write("%s\n" % lineAnno)
                        lineFa = fpFa.readline()
                        cntLineFasta += 1
                        lineFa = lineFa.strip()
                        # Copy sequence lines up to the next header, an
                        # empty line or the end of the file.
                        while lineFa and lineFa[0] != ">":
                            fpout.write("%s\n" % lineFa)
                            lineFa = fpFa.readline()
                            cntLineFasta += 1
                            lineFa = lineFa.strip()
                lineAnno = fpAnno.readline()
                cntAnno += 1
    return 0
def GetFromRawSeq(seqWithAnno, isPrintID, isJustPrintSum, fpout):#{{{
    """Return the sequence length of one FASTA record.

    seqWithAnno    : annotation line followed by the sequence lines.
    isPrintID      : when true, the sequence ID and a tab precede the length.
    isJustPrintSum : when true, nothing is written to fpout.
    fpout          : output handle (unused when isJustPrintSum is true).

    Newlines and spaces are not counted.  A record consisting only of an
    annotation line has length 0.
    """
    # partition() yields an empty sequence part when there is no newline,
    # where slicing from find() == -1 would count the last character.
    seqPart = seqWithAnno.partition("\n")[2]
    length = len(seqPart.replace('\n', '').replace(' ', ''))
    if not isJustPrintSum:
        if isPrintID:
            seqID = myfunc.GetSeqIDFromAnnotation(seqWithAnno)
            fpout.write("%s\t" % seqID)
        fpout.write("%d\n" % length)
    return length
def GetFastaID(infile, fpout): #{{{
    """Write the sequence ID of each annotation line of infile to fpout.

    A line is an annotation line when, after stripping surrounding
    whitespace, it starts with '>'.  Returns 0.
    """
    with open(infile, "r") as fpin:
        for rawLine in fpin:
            annoLine = rawLine.strip()
            if annoLine and annoLine[0] == ">":
                idd = myfunc.GetSeqIDFromAnnotation(annoLine)
                fpout.write(str(idd) + "\n")
    return 0
def GetHMMScore(inFile):#{{{
    """Read per-sequence HMM scores from inFile.

    A record starts at a line containing "SeqID:" and ends at the first
    blank line (or the end of the file).  Inside a record the fields
    SeqLength, NormalizedLogLikelihood, Logodds, Reversi and IsTMProtein
    are read from "Name:value" lines.  Missing values stay "." (length 0).
    If the SeqID line carries no ID, it is taken from a line starting
    with '>'.

    Returns (idList, lengthList, normLogList, logoddsList, reversiList,
    isTMProList).
    """
    # Field name in the file -> slot in the per-record score dictionary.
    fieldNameMap = {
        "NormalizedLogLikelihood": "normLog",
        "Logodds": "logodds",
        "Reversi": "reversi",
        "IsTMProtein": "isTMPro",
    }
    idList = []
    lengthList = []
    normLogList = []
    logoddsList = []
    reversiList = []
    isTMProList = []

    fpin = open(inFile, "r")
    line = fpin.readline()
    while line:
        if line.find("SeqID:") >= 0:
            seqID = "."
            length = 0
            score = {"normLog": ".", "logodds": ".",
                     "reversi": ".", "isTMPro": "."}
            strs = line.split(":")
            if strs[1].strip():
                seqID = strs[1].strip()

            line = fpin.readline()
            while line.strip():
                strs = line.split(":")
                key = strs[0]
                if key == "SeqLength":
                    length = int(strs[1])
                elif key in fieldNameMap:
                    score[fieldNameMap[key]] = strs[1].strip()
                elif key[0] == ">":
                    if seqID == '.':
                        seqID = myfunc.GetSeqIDFromAnnotation(key)
                line = fpin.readline()

            idList.append(seqID)
            lengthList.append(length)
            normLogList.append(score["normLog"])
            logoddsList.append(score["logodds"])
            reversiList.append(score["reversi"])
            isTMProList.append(score["isTMPro"])
        line = fpin.readline()
    fpin.close()
    return (idList, lengthList, normLogList, logoddsList, reversiList,
            isTMProList)
# OutputSplittedSeq(seqWithAnno, rootname, cntsplit, cntseq_of_split, fpout)
#
# Write one FASTA record to the current split file and return the updated
# state (cntsplit, cntseq_of_split, fpout).
#
#   seqWithAnno     : one record; the text before the first newline is
#                     treated as the annotation, the rest as the sequence.
#   rootname        : prefix used for sequentially named split files.
#   cntsplit        : index of the current split file.
#   cntseq_of_split : number of records already written to the current split.
#   fpout           : open handle of the current split file, or None when a
#                     new file has to be opened.
#
# When fpout is None, the output file is named
# "<outpath>/<rootname>_<cntsplit>.<file_ext>" if
# g_params['isNameFileSequentially'] is set, otherwise
# "<outpath>/<seqID>.<file_ext>" (falling back to "<rootname>_<cntsplit>" when
# the sequence ID is empty).  Once g_params['numseq_per_split'] records have
# been written, the file is closed, fpout is reset to None, the per-split
# counter is reset and cntsplit is incremented.
#
# Returns the integer 1 (not a tuple) when the split file cannot be opened.
# When cntseq_of_split is already >= numseq_per_split, only an error message
# is printed to stderr and the state is returned unchanged.
#
# NOTE(review): the indentation of this function has been lost in this copy
# of the file, so the nesting of the "verbose" message relative to the
# "split is full" branch cannot be determined from the text -- confirm
# against the upstream source before reformatting.
def OutputSplittedSeq( seqWithAnno, rootname, cntsplit, #{{{ cntseq_of_split, fpout): # return (cntsplit, cntseq_of_split, fpout) begseq = seqWithAnno.find("\n") seq = seqWithAnno[begseq:] seqID = myfunc.GetSeqIDFromAnnotation(seqWithAnno[0:begseq]) outfile = "" if cntseq_of_split < g_params['numseq_per_split']: if fpout == None: if g_params['isNameFileSequentially']: outfile = (g_params['outpath'] + os.sep + rootname + "_%d" % cntsplit + "." + g_params['file_ext']) else: if seqID == "": seqID = rootname + "_%d" % cntsplit outfile = (g_params['outpath'] + os.sep + seqID + "." + g_params['file_ext']) try: fpout = open(outfile, "w") except IOError: print >> sys.stderr, "Failed to write to file %s" % outfile return 1 fpout.write("%s" % seqWithAnno[0:begseq]) fpout.write("%s\n" % seq) cntseq_of_split += 1 if cntseq_of_split >= g_params['numseq_per_split']: fpout.close() fpout = None cntseq_of_split = 0 cntsplit += 1 if g_params['verbose'] >= 2: if outfile != "": print >> sys.stdout, "split %d\t%s output" % (cntsplit, outfile) else: msg = "Error! cntseq_of_split (%d) >= numseq_per_split (%d)" print >> sys.stderr, msg % (cntseq_of_split, g_params['numseq_per_split']) return (cntsplit, cntseq_of_split, fpout)
def ReadSignalPFile(infile): #{{{
    """Read a SignalP result file into a dict {seqid: line}.

    Empty lines and lines starting with '#' are skipped.  Returns 1 when
    the file cannot be opened by myfunc.ReadLineByBlock().
    """
    hdl = myfunc.ReadLineByBlock(infile)
    dt = {}
    if hdl.failure:
        return 1
    while True:
        lines = hdl.readlines()
        if lines is None:
            break
        for line in lines:
            if line != "" and line[0] != "#":
                seqid = myfunc.GetSeqIDFromAnnotation(line)
                dt[seqid] = line
    hdl.close()
    return dt
def SelectLineByID(infile, idListSet, fpout):#{{{
    """Copy to fpout the lines of infile whose ID is in idListSet.

    Empty lines and lines starting with '#' are always copied.  How the ID
    is obtained from a line depends on g_params['method_getid']:
        0 : first whitespace-separated word
        1 : first word, cut at the first ';'
        2 : myfunc.GetSeqIDFromAnnotation(line)
        3 : the field(s) listed in g_params['sel_field_list'] (1-based);
            several fields form a tuple ID

    A line from which no ID can be extracted (too few fields, or an
    unknown method) is reported and skipped; it is never compared using
    the ID of a previous line.

    Returns 1 when infile cannot be opened, 0 otherwise.
    """
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return 1
    method_getid = g_params['method_getid']
    sel_field_list = g_params['sel_field_list']
    if method_getid == 3:
        if len(sel_field_list) == 0:
            # NOTE(review): with sel_field = 0 the index used below is -1,
            # i.e. the last field of the line -- confirm this is intended.
            sel_field = 0
        elif len(sel_field_list) == 1:
            sel_field = sel_field_list[0]

    lines = hdl.readlines()
    while lines is not None:
        for line in lines:
            if not line or line[0] == "#":
                fpout.write("%s\n" % line)
                continue
            idd = None
            try:
                if method_getid == 0:
                    idd = line.split(None, 1)[0]
                elif method_getid == 1:
                    idd = (line.split(None, 1)[0]).partition(";")[0]
                elif method_getid == 2:
                    idd = myfunc.GetSeqIDFromAnnotation(line)
                elif method_getid == 3:
                    if len(sel_field_list) < 2:
                        idd = line.split()[sel_field - 1]
                    else:
                        strs = line.split()
                        idd = tuple([strs[ff - 1] for ff in sel_field_list])
                else:
                    print(method_getid)
            except IndexError:
                sys.stderr.write("Bad line \"%s\"\n\n" % line)
                continue
            if idd is not None and idd in idListSet:
                fpout.write("%s\n" % line)
        lines = hdl.readlines()
    hdl.close()
    return 0
def ReadSingleFasta_modhmm(inFile):
    """Read a single-sequence FASTA file in modhmm format.

    Lines starting with '/' (membrane topology labelling) are ignored.

    Returns (seqID, annotation, aaSeq).  On failure a message is written
    to stderr and whatever has been collected so far is returned (empty
    strings when the file cannot be opened).
    """
    seqID = ""
    annotation = ""
    seqPartList = []
    try:
        with open(inFile, "r") as fpin:
            for rawLine in fpin:
                line = rawLine.strip()
                if not line:
                    continue
                if line[0] == ">":
                    seqID = myfunc.GetSeqIDFromAnnotation(line)
                    annotation = line.lstrip(">").strip()
                elif line[0] != "/":  # neglecting membrane topology labelling
                    seqPartList.append(line)
    except Exception:
        # Best effort as before, but KeyboardInterrupt and SystemExit are
        # no longer swallowed.
        sys.stderr.write(
            "Except for the input file  %s in the function ReadSingleFasta\n"
            % (inFile,))
    return (seqID, annotation, "".join(seqPartList))
def ReadSignalPeptide_signalp(infile):
    """Read predicted signal peptides from a SignalP output file.

    Lines starting with '#' are skipped.  For each line whose tenth
    whitespace-separated field is "Y", the third field is stored as an int
    under the sequence ID derived from the first field.  Lines with fewer
    than ten fields are ignored.

    Returns the dict; {} (after a message on stderr) on IOError.
    """
    try:
        cleavageDict = {}
        with open(infile, "r") as fpin:
            allLines = fpin.readlines()
        for rawLine in allLines:
            if rawLine[0] == "#":
                continue
            fields = rawLine.split()
            try:
                if fields[9] == "Y":
                    seqid = myfunc.GetSeqIDFromAnnotation(fields[0])
                    cleavageDict[seqid] = int(fields[2])
            except IndexError:
                pass
        return cleavageDict
    except IOError:
        sys.stderr.write("Failed to read infile %s\n" % infile)
        return {}
def WriteIndexFasta(seqWithAnno, fpdb, dbname, fpindex, cntdbfile, #{{{
        record_offset, idSet, idtype):
    """Write sequence to indexed fasta file, sequences with redundant IDs
    are ignored.

    idtype 0 takes the ID from myfunc.GetSeqIDFromAnnotation(), idtype 1
    uses the first word after the leading '>'.  When fpdb is None, the
    file "<dbname><cntdbfile>.db" is opened for writing first.  The index
    line holds "<seqid> <cntdbfile> <offset> <record size>".

    Returns (fpdb, record_offset) with the offset advanced by the size of
    the written record; idSet is updated in place.
    """
    if idtype == 0:
        seqid = myfunc.GetSeqIDFromAnnotation(seqWithAnno)
    elif idtype == 1:
        seqid = myfunc.GetFirstWord(seqWithAnno.lstrip(">"))

    if seqid in idSet:
        return (fpdb, record_offset)

    dbRecord = seqWithAnno + "\n"
    recordSize = len(dbRecord)
    if fpdb is None:
        dbfile = dbname + "%d.db" % (cntdbfile)
        fpdb = open(dbfile, "wb")
        print("dbfile %s is created." % dbfile)
    fpindex.write("%s %d %d %d\n" % (seqid, cntdbfile, record_offset,
                                     recordSize))
    fpdb.write("%s" % dbRecord)
    idSet.add(seqid)
    return (fpdb, record_offset + recordSize)
def Labeltopologyfastaseq(queryTopoFile, alignFile, fastaFile, fpout): #{{{
    """Append a "/<topology labels>/" line to every record of a FASTA file.

    queryTopoFile : single-sequence file holding the query topology.
    alignFile     : needle alignment file, read with ReadNeedleAlignment().
    fastaFile     : FASTA file whose records are labelled.
    fpout         : output handle.

    The labels come from GetTopologyLabels() and are looked up by the
    sequence ID of each record.  A message is written to stderr when the
    sequence and its labels differ in length.  Lines before the first
    header are skipped.

    Returns 0.  Any exception is reported on stderr and re-raised.
    """
    try:
        (queryID, queryAnnotation,
         queryTopology) = myfunc.ReadSingleFasta(queryTopoFile)
        # read in alignment
        alns = ReadNeedleAlignment(alignFile)
        topologyLabels = GetTopologyLabels(queryTopology, alns)
        with open(fastaFile, "r") as fpin:
            lines = fpin.readlines()
        numLine = len(lines)
        i = 0
        while i < numLine:
            line = lines[i]
            if line[0] != '>':
                # Without this step the loop would never advance past a
                # line that is not a header.
                i += 1
                continue
            seqID = myfunc.GetSeqIDFromAnnotation(line)
            seqPartList = []
            fpout.write("%s" % line)
            i += 1
            while i < numLine and lines[i][0] != '>':
                fpout.write("%s" % lines[i])
                seqPartList.append(lines[i].strip())
                i += 1
            aaSeq = "".join(seqPartList)
            fpout.write("/%s/\n" % topologyLabels[seqID])
            if len(aaSeq) != len(topologyLabels[seqID]):
                sys.stderr.write("%s: length not match\n" % seqID)
    except:
        sys.stderr.write("except for the function:%s\n" %
                         sys._getframe().f_code.co_name)
        raise
    return 0
def ReadHHAlignResult(infile):
    """Parse an HHalign/HHsearch result (.hhr) file.

    Returns a list of hit dictionaries.  Each hit is created from a line
    of the " No Hit" summary table (via ScanHHHitLine) and completed from
    the matching "No <n>" alignment section: description, sequence ID,
    statistics (via ScanHHAlignStatLine) and the concatenated aligned
    query/template sequences.

    Returns [] (after a message on stderr) on IOError.  Malformed lines
    are reported on stderr and the exception is re-raised; a truncated
    alignment block terminates the program with status 1.
    """
    try:
        with open(infile, "r") as fpin:
            lines = fpin.read().split("\n")
        hitList = []
        numLine = len(lines)
        cnt = 0
        while cnt < numLine:
            line = lines[cnt]
            if line[0:5] == "Query":
                query_description = line[14:]
                query_seqid = myfunc.GetSeqIDFromAnnotation(query_description)
                cnt += 1
            elif line.find("Match_columns") == 0:
                try:
                    query_legnth = int(line.split()[1])
                except (IndexError, ValueError, TypeError):
                    sys.stderr.write("Bad line:\"%s\"\n" % (line))
                    raise
                cnt += 1
            elif line[0:7] == " No Hit":
                # Summary table: one hit per line up to the first empty line.
                cnt += 1
                while cnt < numLine and lines[cnt] != "":
                    hit = {}
                    (hit['prob'], hit['evalue'], hit['pvalue'], hit['score'],
                     hit['num_align_col'], hit['pos_query_begin'],
                     hit['pos_query_end'], hit['pos_template_begin'],
                     hit['pos_template_end'],
                     hit['template_length']) = ScanHHHitLine(lines[cnt])
                    hit['query_length'] = query_legnth
                    hit['query_seqid'] = query_seqid
                    hit['query_description'] = query_description
                    hitList.append(hit)
                    cnt += 1
                cnt += 1
            elif line[0:3] == "No ":
                # Alignment section of hit number <n> (1-based in the file).
                j = 0
                jstart = cnt
                try:
                    hitIndex = int(line.split()[1]) - 1
                except (IndexError, ValueError, TypeError):
                    sys.stderr.write("Bad hit line: %s\n" % line)
                    raise
                try:
                    hit = hitList[hitIndex]
                except IndexError:
                    sys.stderr.write("hitIndex=%d, numHit=%d\n" % (
                        hitIndex, len(hitList)))
                    raise
                j += 1
                hit['hit_description'] = lines[jstart + j].lstrip(">")
                hit['hit_seqid'] = myfunc.GetSeqIDFromAnnotation(
                    hit['hit_description'])
                j += 1
                hit['statline'] = lines[jstart + j]
                (hit['prob'], hit['evalue'], hit['score'], hit['alignedcol'],
                 hit['identity'], hit['similarity'],
                 hit['sum_prob']) = ScanHHAlignStatLine(hit['statline'])
                j += 2
                alnseqList1 = []
                alnseqList2 = []
                # Pre-bound so that the error message below can always be
                # formatted, even when the very first block is truncated.
                l1 = ""
                l2 = ""
                # The bound check stops the scan at the end of the file.
                while (jstart + j < numLine
                       and lines[jstart + j][0:2] == "Q "):
                    try:
                        l1 = lines[jstart + j][17:]
                        l2 = lines[jstart + j + 4][17:]
                        s1 = l1.split()[1]
                        s2 = l2.split()[1]
                        alnseqList1.append(s1)
                        alnseqList2.append(s2)
                        if lines[jstart + j + 5].strip() != "":
                            j += 8
                        else:
                            j += 7
                    except IndexError:
                        msg = "Bad hhr result file %s. l1=%s, l2=%s"
                        sys.stderr.write(msg % (infile, l1, l2) + "\n")
                        sys.exit(1)
                hit['query_alignseq'] = "".join(alnseqList1)
                hit['template_alignseq'] = "".join(alnseqList2)
                cnt += j
            elif line[0:4] == "Done":
                break
            else:
                cnt += 1
        return hitList
    except IOError:
        sys.stderr.write("Failed to read infile %s\n" % (infile))
        return []
$dbname0.db -q Quiet mode -h, --help Print this help message and exit Created 2011-12-09, updated 2011-12-09, Nanjiang Shu """ MAX_DBFILE_SIZE=2*1024*1024*1024; BLOCK_SIZE=100000; def PrintHelp(): print usage; def WriteIndex(fpdb, record_offset) = WriteIndex(recordList, fpdb, dbname, #{{{ fpindex, cntdbfile, record_offset, processedTopoIDSet); "Write sequence to indexed fasta file, sequences with redundant IDs are ignored" seqid = myfunc.GetSeqIDFromAnnotation(seqWithAnno); if seqid in idSet: return (fpdb, record_offset); else: seqWithAnno+="\n"; if fpdb == None: dbfile=dbname+"%d.db"%(cntdbfile); fpdb=open(dbfile, "wb"); print "dbfile %s is created."%dbfile; fpindex.write("%s %d %d %d\n"%(seqid, cntdbfile, record_offset, len(seqWithAnno))); fpdb.write("%s"%seqWithAnno); record_offset += len(seqWithAnno); idSet.add(seqid); return (fpdb,record_offset); #}}}
def ExtractFromTopconsResult(recordContent): #{{{
    """Parse the text of one TOPCONS result record.

    Returns a dict with the keys 'anno', 'seqid', 'rlty', 'seqlength' and
    one 'predtopo_<method>' entry per predictor, or {} when the record has
    no TOPCONS topology.

    'rlty' is the mean of the per-position reliability values multiplied
    by 100, or -100.0 when the record holds none.  Topologies containing
    "No TM-regions predicted" are replaced by "".  A section that runs to
    the end of the record without a terminating empty line is accepted.
    """
    topoNameList = [
        'predtopo_TOPCONS2', 'predtopo_OCTOPUS', 'predtopo_Philius',
        'predtopo_PolyPhobius', 'predtopo_SCAMPI_msa', 'predtopo_SPOCTOPUS'
    ]
    # Section header (matched at the start of a line) -> record key.
    sectionList = [
        ('SCAMPI predicted topology', 'predtopo_SCAMPI_msa'),
        ('Philius predicted topology', 'predtopo_Philius'),
        ('PolyPhobius predicted topology', 'predtopo_PolyPhobius'),
        ('OCTOPUS predicted topology', 'predtopo_OCTOPUS'),
        ('SPOCTOPUS predicted topology', 'predtopo_SPOCTOPUS'),
        ('TOPCONS predicted topology', 'predtopo_TOPCONS2'),
    ]
    record = {'anno': "", 'rlty': -1}
    for name in topoNameList:
        record[name] = ""

    lines = recordContent.split("\n")
    numLine = len(lines)
    i = 0
    while i < numLine:
        line = lines[i]
        if line.startswith("Sequence name"):
            record['anno'] = line[15:]
            i += 1
            continue
        if line.startswith("Predicted TOPCONS reliability"):
            # Following lines starting with a digit hold "<pos> <value>".
            j = 1
            sumrlty = 0.0
            cnt = 0
            while i + j < numLine and lines[i + j][0:1].isdigit():
                ss = lines[i + j].split()
                if len(ss) == 2:
                    sumrlty += float(ss[1])
                    cnt += 1
                j += 1
            if cnt > 0:
                record['rlty'] = sumrlty / (cnt)
            i += j
            continue
        for (header, key) in sectionList:
            if line.startswith(header):
                # The topology is the concatenation of the lines up to the
                # next empty line (or the end of the record).
                partList = []
                j = 1
                while i + j < numLine and lines[i + j] != "":
                    partList.append(lines[i + j])
                    j += 1
                record[key] += "".join(partList)
                i += j
                break
        else:
            i += 1

    record['rlty'] *= 100.0
    record['seqid'] = myfunc.GetSeqIDFromAnnotation(record['anno'])
    if record['predtopo_TOPCONS2'] == "":
        return {}
    record['seqlength'] = len(record['predtopo_TOPCONS2'])
    for name in topoNameList:
        if record[name].find("No TM-regions predicted") != -1:
            record[name] = ""
    return record
# Topcons2Fasta_method1(infile, outpath)
#
# Read the TOPCONS result file `infile` in chunks of BLOCK_SIZE characters,
# parse the records with Read_topcons_result_from_buffer() and, for every
# record with a non-empty 'predtopo_TOPCONS2', compare that topology with the
# OCTOPUS, SPOCTOPUS, SCAMPI_msa, PolyPhobius and Philius topologies through
# lcmp.MatchTopology() (using the global min_TM_overlap).
#
# Files opened for writing under `outpath`, all prefixed with the base name
# of infile without its extension:
#   _TOPCONS.sp_list           "<seqid> <last position of 'S'> Y" for
#                              topologies containing 'S'
#   _TOPCONS.topo              TOPCONS topologies
#   _TOPCONS.m1.topo           topologies written in the branch where
#                              numIDTtopo == numPredictor and numPredictor >= 4
#   _TOPCONS_filterSP.topo     topologies with every 'S' replaced by the state
#                              that follows the last 'S'
#   .agreement.stat.txt        seqid, numIDTtopo, numPredictor, the match
#                              list and the reliability, tab separated
#   .m1.idt.log / .m1.nonidt.log / .m1.idt.but.numpred.lt.4.log
#
# NOTE(review): the two adjacent string literals forming the FASTA header are
# concatenated without a separating space, so the header reads
# "RLTY=<value>numIDTtopo=<n> ..." -- confirm whether consumers rely on it.
# NOTE(review): the IOError handler only prints a "Failed to read file"
# message; it neither re-raises nor returns a status, and the open files are
# closed on the success path only.
# NOTE(review): the printed summary lists the .topo, agreement and log files
# but not the sp_list, m1.topo and filterSP files.
# NOTE(review): the indentation of this function has been lost in this copy
# of the file; whether the filterSP and agreement branches are nested inside
# the "topology contains 'M'" test cannot be determined from the text --
# confirm against the upstream source before reformatting.
def Topcons2Fasta_method1(infile, outpath): #{{{ try: rootname = os.path.basename(os.path.splitext(infile)[0]) outfile_SPlist = outpath + os.sep + rootname + "_TOPCONS.sp_list" outfile_TOPCONS = outpath + os.sep + rootname + "_TOPCONS.topo" outfile_TOPCONS_m1 = outpath + os.sep + rootname + "_TOPCONS.m1.topo" outfile_TOPCONS_filterSP = outpath + os.sep + rootname + "_TOPCONS_filterSP.topo" outfile_agreement = outpath + os.sep + rootname + ".agreement.stat.txt" logfile1 = outpath + os.sep + rootname + ".m1.idt.log" logfile2 = outpath + os.sep + rootname + ".m1.nonidt.log" logfile3 = outpath + os.sep + rootname + ".m1.idt.but.numpred.lt.4.log" fpout_SPlist = open(outfile_SPlist, "w") fpout_TOPCONS = open(outfile_TOPCONS, "w") fpout_TOPCONS_m1 = open(outfile_TOPCONS_m1, "w") fpout_TOPCONS_filterSP = open(outfile_TOPCONS_filterSP, "w") fplog1 = open(logfile1, "w") fplog2 = open(logfile2, "w") fplog3 = open(logfile3, "w") fpout_agree = open(outfile_agreement, "w") fpin = open(infile, "r") unprocessedBuffer = "" isEOFreached = False processedTopoIDSet = set([]) while 1: buff = fpin.read(BLOCK_SIZE) if len(buff) < BLOCK_SIZE: isEOFreached = True buff = unprocessedBuffer + buff recordList = [] unprocessedBuffer = Read_topcons_result_from_buffer( buff, recordList, isEOFreached) if len(recordList) > 0: for record in recordList: if record['predtopo_TOPCONS2'] == "": continue seqid = myfunc.GetSeqIDFromAnnotation(record['anno']) topoList = [] topoList.append(record['predtopo_OCTOPUS']) topoList.append(record['predtopo_SPOCTOPUS']) topoList.append(record['predtopo_SCAMPI_msa']) topoList.append(record['predtopo_PolyPhobius']) topoList.append(record['predtopo_Philius']) # print "=======================" # print seqid # print topoList # print "=======================" # Annotation: matchList, matching the target topology to ordered topology list # 1 for identical, 0 for non identical and -1 for empty topology (matchList, numIDTtopo, numPredictor) = lcmp.MatchTopology( 
record['predtopo_TOPCONS2'], topoList, min_TM_overlap, seqid) fpout_agree.write("%s\t%d\t%d" % (seqid, numIDTtopo, numPredictor)) for tt in matchList: fpout_agree.write("\t%d" % (tt)) fpout_agree.write("\t%6.2f" % (record['rlty'])) fpout_agree.write("\n") msg = ">%s TOPCONS RLTY=%.2f" \ "numIDTtopo=%d numPredictor=%d\n" if record['predtopo_TOPCONS2'].find('S') >= 0: pp = record['predtopo_TOPCONS2'].rfind('S') fpout_SPlist.write("%s %d %s\n" % (record['seqid'], pp, 'Y')) if record['predtopo_TOPCONS2'].find('M') >= 0: fpout_TOPCONS.write( msg % (record['anno'], record['rlty'], numIDTtopo, numPredictor)) fpout_TOPCONS.write("%s\n" % record['predtopo_TOPCONS2']) if record['predtopo_TOPCONS2'].find('S') >= 0: pp = record['predtopo_TOPCONS2'].rfind('S') iostat = record['predtopo_TOPCONS2'][pp + 1] top = record['predtopo_TOPCONS2'].replace( 'S', iostat) else: top = record['predtopo_TOPCONS2'] fpout_TOPCONS_filterSP.write( msg % (record['anno'], record['rlty'], numIDTtopo, numPredictor)) fpout_TOPCONS_filterSP.write("%s\n" % top) if numIDTtopo == numPredictor: if numPredictor >= 4: msg = ">%s TOPCONS RLTY=%.2f" \ "numIDTtopo=%d numPredictor=%d\n" fpout_TOPCONS_m1.write( msg % (record['anno'], record['rlty'], numIDTtopo, numPredictor)) fpout_TOPCONS_m1.write("%s\n" % record['predtopo_TOPCONS2']) msg = "%s RLTY= %.2f numIDTtopo= %d numPredictor= %d" print >> fplog1, msg % (seqid, record['rlty'], numIDTtopo, numPredictor) else: msg = "%s RLTY= %.2f numIDTtopo= %d numPredictor= %d" print >> fplog3, msg % (seqid, record['rlty'], numIDTtopo, numPredictor) else: msg = "%s RLTY= %.2f numIDTtopo= %d numPredictor= %d" print >> fplog2, msg % (seqid, record['rlty'], numIDTtopo, numPredictor) if isEOFreached == True: break fpin.close() fpout_SPlist.close() fpout_TOPCONS.close() fpout_TOPCONS_m1.close() fpout_TOPCONS_filterSP.close() fplog1.close() fplog2.close() fplog3.close() fpout_agree.close() print "Result have been output to" print "\t%s" % outfile_TOPCONS print "\t%s" % 
outfile_agreement print "\t%s" % logfile1 print "\t%s" % logfile2 print "\t%s" % logfile3 except IOError: msg = "Failed to read file {} in function {}" print >> sys.stderr, msg.format(infile, sys._getframe().f_code.co_name)
def Topcons2Fasta(infile, outpath): #{{{
    """Split a TOPCONS result file into one topology file per predictor.

    infile  : TOPCONS result file, read in chunks of BLOCK_SIZE characters.
    outpath : directory receiving the output files, all prefixed with the
              base name of infile without its extension:
                  _TOPCONS2.topo, _OCTOPUS.topo, _SPOCTOPUS.topo,
                  _SCAMPI_msa.topo, _Philius.topo, _PolyPhobius.topo
              and _TOPCONS.rlty ("<seqid> <reliability>" per record).

    Each non-empty topology is written as a ">anno predtopo_<method>" line
    (the TOPCONS2 header also carries "rlty=<value>") followed by the
    topology.  Reliabilities equal to -100.0 are not written.

    IOError is reported on stderr and re-raised.
    """
    methodList = ["TOPCONS2", "OCTOPUS", "SPOCTOPUS", "SCAMPI_msa",
                  "Philius", "PolyPhobius"]
    try:
        rootname = os.path.basename(os.path.splitext(infile)[0])
        outfileDict = {}
        for method in methodList:
            outfileDict[method] = (outpath + os.sep + rootname +
                                   "_" + method + ".topo")
        outRLTYFile = outpath + os.sep + rootname + "_TOPCONS.rlty"

        openedList = []  # every handle opened here, closed in "finally"
        try:
            fpoutDict = {}
            for method in methodList:
                fpoutDict[method] = open(outfileDict[method], "w")
                openedList.append(fpoutDict[method])
            fpout_rlty = open(outRLTYFile, "w")
            openedList.append(fpout_rlty)
            fpin = open(infile, "r")
            openedList.append(fpin)

            unprocessedBuffer = ""
            isEOFreached = False
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The parser returns the tail it could not process yet; it
                # is prepended to the next chunk.
                unprocessedBuffer = Read_topcons_result_from_buffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for record in recordList:
                    seqid = myfunc.GetSeqIDFromAnnotation(record['anno'])
                    for method in methodList:
                        key = 'predtopo_' + method
                        topo = record[key]
                        if topo == "":
                            continue
                        if method == "TOPCONS2":
                            header = ">%s predtopo_TOPCONS2 rlty=%.2f\n" % (
                                record['anno'], record['rlty'])
                        else:
                            header = ">%s %s\n" % (record['anno'], key)
                        fpoutDict[method].write(header)
                        fpoutDict[method].write("%s\n" % topo)
                    if record['rlty'] != -100.0:
                        fpout_rlty.write("%s %.2f\n" % (seqid,
                                                        record['rlty']))
        finally:
            for fp in openedList:
                fp.close()

        print("Results have been output to")
        for method in methodList:
            print("\t%s" % outfileDict[method])
        print("\t%s" % outRLTYFile)
    except IOError:
        sys.stderr.write("Failed to open file %s for read\n" % (infile))
        raise
def TopconsSingle2Fasta(infile, outpath): #{{{
    """Split a TOPCONS-single result file into one topology file per method.

    infile  : TOPCONS-single result file, read in chunks of BLOCK_SIZE
              characters.
    outpath : directory receiving the output files, all prefixed with the
              base name of infile without its extension:
                  _topcons_single.topo, _scampi_single.topo, _hmmtop.topo,
                  _stmhmm.topo, _memsat.topo
              and _topcons_single.rlty ("<seqid> <reliability>" per record).

    Each non-empty topology is written as a ">anno <method>" line (the
    topcons_single header also carries "rlty=<value>") followed by the
    topology.  Reliabilities equal to -100.0 are not written.

    IOError is reported on stderr and re-raised.
    """
    methodList = ["topcons_single", "scampi_single", "hmmtop", "stmhmm",
                  "memsat"]
    try:
        rootname = os.path.basename(os.path.splitext(infile)[0])
        outfileDict = {}
        for method in methodList:
            outfileDict[method] = (outpath + os.sep + rootname +
                                   "_" + method + ".topo")
        outRLTYFile = outpath + os.sep + rootname + "_topcons_single.rlty"

        openedList = []  # every handle opened here, closed in "finally"
        try:
            fpoutDict = {}
            for method in methodList:
                fpoutDict[method] = open(outfileDict[method], "w")
                openedList.append(fpoutDict[method])
            fpout_rlty = open(outRLTYFile, "w")
            openedList.append(fpout_rlty)
            fpin = open(infile, "r")
            openedList.append(fpin)

            unprocessedBuffer = ""
            isEOFreached = False
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The parser returns the tail it could not process yet; it
                # is prepended to the next chunk.
                unprocessedBuffer = Read_topconssingle_result_from_buffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for record in recordList:
                    seqid = myfunc.GetSeqIDFromAnnotation(record['anno'])
                    for method in methodList:
                        topo = record['predtopo_' + method]
                        if topo == "":
                            continue
                        if method == "topcons_single":
                            header = ">%s topcons_single rlty=%.2f\n" % (
                                record['anno'], record['rlty'])
                        else:
                            header = ">%s %s\n" % (record['anno'], method)
                        fpoutDict[method].write(header)
                        fpoutDict[method].write("%s\n" % topo)
                    if record['rlty'] != -100.0:
                        fpout_rlty.write("%s %.2f\n" % (seqid,
                                                        record['rlty']))
        finally:
            for fp in openedList:
                fp.close()

        print("Result have been output to")
        for method in methodList:
            print("\t%s" % outfileDict[method])
        print("\t%s" % outRLTYFile)
    except IOError:
        sys.stderr.write("Failed to open file %s for read\n" % (infile))
        raise
pfamidList = [] extra_desp_dict = {} if extra_description_file != "": hdl_extra = myfunc.ReadLineByBlock(extra_description_file) if hdl_extra.failure: print >> sys.stderr, "Failed to read extra_description_file %s." % ( extra_description_file) return 1 lines = hdl_extra.readlines() while lines != None: for line in lines: line = line.strip() if not line or line[0] == "#": continue seqid = myfunc.GetSeqIDFromAnnotation(line) if seqid != "": extra_desp_dict[seqid] = line lines = hdl_extra.readlines() hdl = myfunc.ReadLineByBlock(mapfile) if hdl.failure: print >> sys.stderr, "Failed to read mapfile %s. exit" % (mapfile) return 1 cntfam = 0 lines = hdl.readlines() while lines != None: for line in lines: line = line.strip() if not line or line[0] == "#":
def WriteSubconsTextResultFile(outfile, outpath_result, maplist, #{{{
        runtime_in_sec, base_www_url, statfile=""):
    """Write the plain-text Subcons result file.

    outfile        : result file to write.
    outpath_result : directory holding one sub-folder per sequence.
    maplist        : lines "<subfolder>\\t<length>\\t<description>\\t<seq>".
    runtime_in_sec : total request time, written to the header.
    base_www_url   : server URL, written to the header.
    statfile       : when not "", this file is created (or truncated);
                     nothing is written to it.

    For each sequence the table in
    "<outpath_result>/<subfolder>/plot/query_0_final.csv" is rendered with
    tabulate when it has at least six lines; a shorter file is included
    as it is, and a missing or empty one yields a "no prediction" message.

    IOError is reported on stdout; nothing is returned.
    """
    separator = "##############################################################################"
    try:
        fpout = open(outfile, "w")
        try:
            if statfile != "":
                # Creates or truncates the file exactly as before, but does
                # not leave the handle open.
                open(statfile, "w").close()
            date_str = time.strftime(FORMAT_DATETIME)
            fpout.write(separator + "\n")
            fpout.write("Subcons result file\n")
            fpout.write("Generated from %s at %s\n" % (base_www_url,
                                                      date_str))
            fpout.write("Total request time: %.1f seconds.\n" %
                        (runtime_in_sec))
            fpout.write(separator + "\n")
            cnt = 0
            for line in maplist:
                strs = line.split('\t')
                subfoldername = strs[0]
                length = int(strs[1])
                desp = strs[2]
                seq = strs[3]
                fpout.write("Sequence number: %d\n" % (cnt + 1))
                fpout.write("Sequence name: %s\n" % (desp))
                fpout.write("Sequence length: %d aa.\n" % (length))
                fpout.write("Sequence:\n%s\n\n\n" % (seq))
                rstfile = "%s/%s/%s/query_0_final.csv" % (
                    outpath_result, subfoldername, "plot")
                if os.path.exists(rstfile):
                    content = myfunc.ReadFile(rstfile).strip()
                    lines = content.split("\n")
                    if len(lines) >= 6:
                        header_line = lines[0].split("\t")
                        if header_line[0].strip() == "":
                            header_line[0] = "Method"
                        header_line = [x.strip() for x in header_line]
                        data_line = [[x.strip() for x in row.split("\t")]
                                     for row in lines[1:]]
                        content = tabulate.tabulate(data_line, header_line,
                                                    'plain')
                else:
                    content = ""
                if content == "":
                    content = "***No prediction could be produced with this method***"
                fpout.write("Prediction results:\n\n%s\n\n\n" % (content))
                fpout.write(separator + "\n")
                cnt += 1
        finally:
            # Closing flushes the buffered output to disk.
            fpout.close()
    except IOError:
        print("Failed to write to file %s" % (outfile))
def TopconsSingle2Fasta_method1(infile, outpath): #{{{
    """Select TOPCONS-single topologies confirmed by the single predictors.

    infile  : TOPCONS-single result file, read in chunks of BLOCK_SIZE
              characters.
    outpath : directory receiving the output files, all prefixed with the
              base name of infile without its extension:
                  _topcons_single.m1.topo  topologies for which all
                                           predictors agree (at least two)
                  .agreement.stat.txt      seqid, numIDTtopo, numPredictor,
                                           match list and reliability
                  .m1.idt.log              records written to the .topo file
                  .m1.nonidt.log           all other records

    Records without a topcons_single prediction are ignored.  The
    topology is compared with the scampi_single, hmmtop, stmhmm and memsat
    topologies through lcmp.MatchTopology().

    Returns 1 after reporting an IOError on stderr; nothing otherwise.
    """
    # NOTE(review): the indentation of the original was lost in this copy
    # of the file; "return 1" is taken to belong to the IOError branch --
    # confirm against the upstream source.
    try:
        rootname = os.path.basename(os.path.splitext(infile)[0])
        outfile_topcons_single = (outpath + os.sep + rootname +
                                  "_topcons_single.m1.topo")
        outfile_agreement = outpath + os.sep + rootname + ".agreement.stat.txt"
        logfile1 = outpath + os.sep + rootname + ".m1.idt.log"
        logfile2 = outpath + os.sep + rootname + ".m1.nonidt.log"

        openedList = []  # every handle opened here, closed in "finally"
        try:
            fpout_topcons_single = open(outfile_topcons_single, "w")
            openedList.append(fpout_topcons_single)
            fplog1 = open(logfile1, "w")
            openedList.append(fplog1)
            fplog2 = open(logfile2, "w")
            openedList.append(fplog2)
            fpout_agree = open(outfile_agreement, "w")
            openedList.append(fpout_agree)
            fpin = open(infile, "r")
            openedList.append(fpin)

            unprocessedBuffer = ""
            isEOFreached = False
            while not isEOFreached:
                buff = fpin.read(BLOCK_SIZE)
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                recordList = []
                # The parser returns the tail it could not process yet; it
                # is prepended to the next chunk.
                unprocessedBuffer = Read_topconssingle_result_from_buffer(
                    unprocessedBuffer + buff, recordList, isEOFreached)
                for record in recordList:
                    # ignore cases where topcons_single does not give a
                    # prediction
                    if record['predtopo_topcons_single'] == "":
                        continue
                    seqid = myfunc.GetSeqIDFromAnnotation(record['anno'])
                    topoList = [
                        record['predtopo_scampi_single'],
                        record['predtopo_hmmtop'],
                        record['predtopo_stmhmm'],
                        record['predtopo_memsat'],
                    ]
                    (matchList, numIDTtopo,
                     numPredictor) = lcmp.MatchTopology(
                         record['predtopo_topcons_single'], topoList,
                         min_TM_overlap, seqid)

                    fpout_agree.write("%s\t%d\t%d" % (seqid, numIDTtopo,
                                                      numPredictor))
                    for tt in matchList:
                        fpout_agree.write("\t%d" % (tt))
                    fpout_agree.write("\t%6.2f" % (record['rlty']))
                    fpout_agree.write("\n")

                    logmsg = "%s RLTY= %.2f numIDTtopo= %d numPredictor= %d"
                    logline = logmsg % (seqid, record['rlty'], numIDTtopo,
                                        numPredictor) + "\n"
                    if numIDTtopo == numPredictor and numIDTtopo >= 2:
                        # NOTE(review): there is no space between the RLTY
                        # value and "numIDTtopo="; kept unchanged because
                        # readers of this file may depend on it.
                        msg = ">%s topcons_single RLTY=%.2fnumIDTtopo=%d numPredictor=%d\n"
                        fpout_topcons_single.write(
                            msg % (record['anno'], record['rlty'],
                                   numIDTtopo, numPredictor))
                        fpout_topcons_single.write(
                            "%s\n" % record['predtopo_topcons_single'])
                        fplog1.write(logline)
                    else:
                        fplog2.write(logline)
        finally:
            for fp in openedList:
                fp.close()

        print("Result have been output to")
        print("\t%s" % outfile_topcons_single)
        print("\t%s" % outfile_agreement)
        print("\t%s" % logfile1)
        print("\t%s" % logfile2)
    except IOError:
        msg = "Failed to read file {} in function {}"
        sys.stderr.write(
            msg.format(infile, sys._getframe().f_code.co_name) + "\n")
        return 1