def Ana_NumContUnmappedTM(infile):
    """Tabulate counts of continuous unmapped TM helices and plot them.

    The pairwise comparison file ``infile`` is read in chunks of
    ``BLOCK_SIZE``.  Every parsed batch of records is handed to the four
    ``CountContinuousUnmappedTM_<n>`` counters, each once with
    ``isFilterNeighbour=True`` and once with ``isFilterNeighbour=False``.
    For each (method, filter) combination a two-column table with the
    counts stored under keys 1..20 (0 when a key is absent) is written into
    the directory of ``infile`` ("." when ``infile`` has no directory part)
    and the external histogram plotting script is run on it.

    Args:
        infile: path of the pairwise comparison result file.

    Returns:
        1 if an IOError occurs while reading or writing, otherwise None.
    """
    methodList = [0, 1, 2, 3]
    plotCmd = "/data3/wk/MPTopo/src/tmp_plot_histogram_logscale.sh %s"
    outpath = os.path.dirname(infile) or "."

    # freqDict[2*m] collects counts of method m with neighbour filtering,
    # freqDict[2*m+1] the counts of the same method without it.
    freqDict = {}
    for method in methodList:
        freqDict[2 * method] = {}
        freqDict[2 * method + 1] = {}

    try:
        # "with" guarantees the input handle is released even when parsing
        # or counting raises.
        with open(infile) as fpin:
            unprocessedBuffer = ""
            while True:
                buff = fpin.read(BLOCK_SIZE)
                isEOFreached = not buff
                pairCmpRecordList = []
                # The parser returns the trailing text it could not consume
                # yet; it is prepended to the next chunk.
                unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                    unprocessedBuffer + buff, pairCmpRecordList)
                if pairCmpRecordList:
                    counters = (CountContinuousUnmappedTM_0,
                                CountContinuousUnmappedTM_1,
                                CountContinuousUnmappedTM_2,
                                CountContinuousUnmappedTM_3)
                    for method, counter in enumerate(counters):
                        counter(pairCmpRecordList, freqDict[2 * method],
                                isFilterNeighbour=True)
                        counter(pairCmpRecordList, freqDict[2 * method + 1],
                                isFilterNeighbour=False)
                if isEOFreached:
                    break

        for method in methodList:
            for idx in (2 * method, 2 * method + 1):
                if idx == 2 * method:
                    str_filter = "True"
                    outfile = (outpath + os.sep +
                               "tmp_ana_numContTM_method%d_filternb.txt" %
                               (method))
                else:
                    str_filter = "False"
                    outfile = (outpath + os.sep +
                               "tmp_ana_numContTM_method%d_nonfilternb.txt" %
                               (method))
                with open(outfile, "w") as fpout:
                    # The original emitted one empty line on stdout per
                    # table; kept so console output is unchanged.
                    print("")
                    fpout.write(
                        "#numTM count Method_%d isFilterNeighbour=%s\n" %
                        (method, str_filter))
                    counts = freqDict[idx]
                    for i in range(1, 21):
                        fpout.write("%-5d %5d\n" % (i, counts.get(i, 0)))
                # The table is closed (flushed) before the plot script
                # reads it.
                os.system(plotCmd % (outfile))
    except IOError:
        return 1
def main():  #{{{
    """Filter the records of a pairwise comparison file and write them out.

    Command line (read from ``sys.argv``):
        infile              input file (also accepted after ``--``)
        -o/-outfile FILE    output file; stdout when absent or not writable
        -gap FLOAT          sets parameters['minGapFraction']
        -dg FLOAT           sets parameters['maxDGvalue']
        -q                  accepted; the flag is not consulted here
        -h/--help           print help and exit

    The input is read in chunks of ``BLOCK_SIZE``; each batch of parsed
    records is passed through ``FilterPairCmpResult`` and written with
    ``lcmp.WritePairCmpRecord``.

    Returns:
        0 on success, 1 when called without arguments, -1 on a usage error
        or when the input file is missing or cannot be opened.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    parameters = {
        'minGapFraction': 0.5,
        'maxGapFraction': 1.0,
        'minDGvalue': -999999.0,
        'maxDGvalue': 1.0,
    }
    # option spelling -> key of `parameters` that takes the float value
    floatOptions = {
        "-gap": 'minGapFraction',
        "--gap": 'minGapFraction',
        "-dg": 'maxDGvalue',
        "--dg": 'maxDGvalue',
    }
    outfileOptions = ('-o', '--o', "-outfile", "--outfile")

    infile = ""
    outfile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg:
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ("-h", "--help"):
                PrintHelp()
                sys.exit()
            elif arg == "-q":
                i += 1
            elif arg in floatOptions or arg in outfileOptions:
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value.\n" % arg)
                    return -1
                if arg in floatOptions:
                    parameters[floatOptions[arg]] = float(sys.argv[i + 1])
                else:
                    outfile = sys.argv[i + 1]
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return -1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("infile not set. Exit.\n")
        return -1
    elif not os.path.exists(infile):
        sys.stderr.write("infile %s does not exists. Exit.\n" % infile)
        # The message announces an exit, so actually stop here.
        return -1

    # Open the input first so that an unreadable input does not truncate
    # an existing output file.
    try:
        fpin = open(infile, "rb")
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % (infile))
        return -1

    fpout = sys.stdout
    if outfile != "":
        try:
            fpout = open(outfile, "w")
        except IOError:
            sys.stderr.write("Failed to write to file %s.\n" % outfile)
            sys.stderr.write("Reset output to sys.stdout.\n")
            fpout = sys.stdout

    unprocessedBuffer = ""
    cntTotalOutputRecord = 0
    cntTotalReadInRecord = 0
    try:
        while True:
            buff = fpin.read(BLOCK_SIZE)
            isEOFreached = not buff
            pairCmpRecordList = []
            # Text the parser could not consume yet is carried over to the
            # next chunk.
            unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                unprocessedBuffer + buff, pairCmpRecordList)
            if pairCmpRecordList:
                filteredList = FilterPairCmpResult(pairCmpRecordList,
                                                   parameters)
                (status, cntTotalOutputRecord) = lcmp.WritePairCmpRecord(
                    filteredList, cntTotalOutputRecord, fpout)
                cntTotalReadInRecord += len(pairCmpRecordList)
            if isEOFreached:
                break
    finally:
        fpin.close()
        if fpout is not None and fpout is not sys.stdout:
            fpout.close()

    sys.stdout.write("cntTotalReadInRecord = %s\n" % cntTotalReadInRecord)
    sys.stdout.write("cntTotalOutputRecord = %s\n" % cntTotalOutputRecord)
    return 0
def main(g_params):  #{{{
    """Annotate, filter and re-write the records of a paircmp file.

    Command line (read from ``sys.argv``; options taking a value are parsed
    with the ``myfunc.my_getopt_*`` helpers):
        infile                 input file (also accepted after ``--``)
        -o FILE                output file (opened with ``myfunc.myopen``)
        -cmpclass STR          appended to the class list; may be repeated
        -signalp FILE          signal peptide file
        -restrictidlist FILE   ID list; sets g_params['isRestrictIDListSet']
        -dup/-dupfile FILE     duplicated pair list
        -rmsp / -rmdup         set g_params['isRemoveSignalP'/'isRemoveDup']
        -seq2fammap FILE       parsed, value not used by this function
        -seqidttype INT        g_params['seqidttype']
        -tableinfo FILE        required when seqidttype != 0
        -min-seqidt/-max-seqidt FLOAT
        -evodist               sets g_params['isEvodist']
        -alignrange all|full|part
        -debug y|n             g_params['isDEBUG']
        -debug-unmapped-position VALUE   consumed, currently without effect
        -q                     accepted; the flag is not consulted here

    Args:
        g_params: dict of global parameters; updated in place by the
            options above.  ``g_params['seqidttype']`` must already exist.

    Returns:
        0 on success; 1 when called without arguments or with an invalid
        ``-alignrange``; -1 on any other usage error or when a required
        file is missing or cannot be opened.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    infile = ""
    tableinfoFile = ""
    cmpclassList = []
    restrictIDListFile = ""
    signalpFile = ""
    dupFile = ""
    outfile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                sys.exit()
            elif arg in ["-o", "--o"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-cmpclass", "--cmpclass"]:
                (tmpstr, i) = myfunc.my_getopt_str(argv, i)
                cmpclassList.append(tmpstr)
            elif arg in ["-signalp", "--signalp"]:
                (signalpFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-restrictidlist", "--restrictidlist"]:
                (restrictIDListFile, i) = myfunc.my_getopt_str(argv, i)
                g_params['isRestrictIDListSet'] = True
            elif arg in ["-dup", "--dup", "-dupfile", "--dupfile"]:
                (dupFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-rmsp", "--rmsp"]:
                g_params['isRemoveSignalP'] = True
                i += 1
            elif arg in ["-rmdup", "--rmdup"]:
                g_params['isRemoveDup'] = True
                i += 1
            elif arg in ["-seq2fammap", "--seq2fammap"]:
                # The option is still consumed for compatibility; its value
                # was never used by this function.
                (_unused, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-seqidttype", "--seqidttype"]:
                g_params['seqidttype'], i = myfunc.my_getopt_int(argv, i)
            elif arg in ["-tableinfo", "--tableinfo"]:
                tableinfoFile, i = myfunc.my_getopt_str(argv, i)
            elif arg in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif arg in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif arg in ["-evodist", "--evodist"]:
                g_params['isEvodist'] = True
                i += 1
            elif arg in ["-alignrange", "--alignrange"]:
                g_params['alignrange'], i = myfunc.my_getopt_str(argv, i)
                if g_params['alignrange'] not in ['all', 'full', 'part']:
                    sys.stderr.write(
                        "alignrange must be one of [all, full, part]\n")
                    return 1
                if g_params['alignrange'] == 'full':
                    g_params['alignrange'] = 'FULL_ALIGNED'
                elif g_params['alignrange'] == 'part':
                    g_params['alignrange'] = 'PART_ALIGNED'
            elif arg in ["-debug", "--debug"]:
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value.\n" % arg)
                    return -1
                g_params['isDEBUG'] = argv[i + 1][:1].lower() == 'y'
                i += 2
            elif arg in ["-debug-unmapped-position",
                         "--debug-unmapped-position"]:
                # NOTE(review): the original assigned a *local*
                # DEBUG_UNMAPPED_TM_POSITION = 1 here, which had no effect;
                # presumably a module-level flag was intended - confirm.
                # The option and its value are still consumed.
                i += 2
            elif arg == "-q":
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return -1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("infile not set. Exit.\n")
        return -1
    elif not os.path.exists(infile):
        sys.stderr.write("infile %s does not exists. Exit.\n" % infile)
        # The message announces an exit, so actually stop here.
        return -1

    pairalnStat = {}
    if g_params['seqidttype'] != 0:
        if tableinfoFile == "" or not os.path.exists(tableinfoFile):
            sys.stderr.write("tableinfoFile must be set when seqidttype "
                             "is set to 1 or 2\n")
            sys.stderr.write("but seqidttype = %d is set. Exit.\n" %
                             g_params['seqidttype'])
            return -1
        pairalnStat = lcmp.ReadPairAlnTableInfo(tableinfoFile)

    signalpDict = {}
    if signalpFile != "":
        signalpDict = lcmp.ReadSignalPDict(signalpFile)
    if signalpDict != {}:
        g_params['isSignalPSet'] = True

    dupPairList = []
    if dupFile != "":
        dupPairList = lcmp.ReadDupPairList(dupFile)
    if len(dupPairList) > 0:
        g_params['isDupSet'] = True
    dupPairSet = set(dupPairList)

    restrictIDSet = set([])
    if restrictIDListFile != "":
        restrictIDSet = set(myfunc.ReadIDList(restrictIDListFile))

    rltyDict = {}

    # The input is opened only after all auxiliary checks, so no early
    # return above can leave the handle open.
    try:
        fpin = open(infile, "rb")
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % (infile))
        return -1

    cntTotalReadInRecord = 0
    cntTotalOutputRecord = 0
    try:
        fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
        try:
            unprocessedBuffer = ""
            while True:
                buff = fpin.read(BLOCK_SIZE)
                isEOFreached = not buff
                pairCmpRecordList = []
                # Text the parser could not consume yet is carried over to
                # the next chunk.
                unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                    unprocessedBuffer + buff, pairCmpRecordList)

                AddTableInfo(pairCmpRecordList, pairalnStat)
                AddSignalPInfo(pairCmpRecordList, signalpDict)
                AddDupInfo(pairCmpRecordList, dupPairSet)

                cntTotalReadInRecord += len(pairCmpRecordList)
                pairCmpRecordList = FilterPairCmpResult(
                    pairCmpRecordList, cmpclassList, rltyDict, restrictIDSet)

                if len(pairCmpRecordList) > 0:
                    lcmp.WritePairCmpRecord(pairCmpRecordList,
                                            cntTotalOutputRecord, fpout)
                    cntTotalOutputRecord += len(pairCmpRecordList)
                if isEOFreached:
                    break
        finally:
            myfunc.myclose(fpout)
    finally:
        fpin.close()

    sys.stdout.write("cntTotalReadInRecord = %s\n" % cntTotalReadInRecord)
    sys.stdout.write("cntTotalOutputRecord = %s\n" % cntTotalOutputRecord)
    return 0
def Ana_NumTMHeatMap(infile, seqid2pfamidDict, seqid2clanidDict,
                     tm_pfamidSet, tm_clanidSet, pfamidDefDict,
                     clanidDefDict, signalpDict, classList_TableNumTMHeatMap,
                     SPE_PAIR_LIST, pfamid2seqidDict, clanid2seqidDict,
                     idSet_TMpro, usedPfamIDSet, alignrange):  #{{{
    """Build numTM heat-map tables from a paircmp file and write reports.

    Steps, in order:
      1. Initialise one heat-map table per class with
         ``InitTableNumTMHeatMap`` and an empty pair-info list per class.
      2. Read ``infile`` in chunks of ``BLOCK_SIZE`` and feed every parsed
         batch to ``AnaPairCmpResultNumTMHeatMap``.
      3. For every class write the Pfam and clan pair-count files (plus the
         ``cmpdup`` variants when g_params['pairwise_comparison_method'] is
         3), then for each normalisation mode write the heat map, run the
         plotting script when writing returned 0, and write the
         special-pair analysis.

    Output goes to g_params['outpath'] when that is non-empty, otherwise to
    the directory of ``infile`` ("." if it has none).  Module-level names
    used: ``g_params``, ``binpath``, ``BLOCK_SIZE`` and the
    ``cmpClassList_*`` lists.

    Returns:
        1 if an IOError occurs, otherwise None.
    """
    dataTableNumTMHeatMap = {}
    InitTableNumTMHeatMap(dataTableNumTMHeatMap, classList_TableNumTMHeatMap,
                          100, SPE_PAIR_LIST)
    pairInfoListDict = {}
    for cls in classList_TableNumTMHeatMap:
        pairInfoListDict[cls] = []

    if g_params['outpath'] != "":
        outpath = g_params['outpath']
    else:
        outpath = os.path.dirname(infile)
        if outpath == "":
            outpath = "."

    try:
        cntTotalReadInRecord = 0
        # "with" releases the input handle even if the analysis raises.
        with open(infile) as fpin:
            unprocessedBuffer = ""
            while True:
                buff = fpin.read(BLOCK_SIZE)
                isEOFreached = not buff
                pairCmpRecordList = []
                # Text the parser could not consume yet is carried over to
                # the next chunk.
                unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                    unprocessedBuffer + buff, pairCmpRecordList)
                if len(pairCmpRecordList) > 0:
                    AnaPairCmpResultNumTMHeatMap(
                        pairCmpRecordList, dataTableNumTMHeatMap,
                        pairInfoListDict, classList_TableNumTMHeatMap,
                        signalpDict, SPE_PAIR_LIST, alignrange)
                    cntTotalReadInRecord += len(pairCmpRecordList)
                    # Progress output; the literal ends in a blank, so two
                    # blanks precede the number (as with the print
                    # statement).
                    sys.stdout.write("cntTotalReadInRecord =  %s\n" %
                                     cntTotalReadInRecord)
                if isEOFreached:
                    break

        # Report the number of special pairs in class 'RMSP', if present.
        try:
            rmspPairInfoLists = dataTableNumTMHeatMap['RMSP']['pairInfoLists']
            for i in range(len(rmspPairInfoLists)):
                sys.stdout.write("%s %s\n" % (SPE_PAIR_LIST[i],
                                              len(rmspPairInfoLists[i])))
        except KeyError:
            pass

        for cls in classList_TableNumTMHeatMap:  # ["ALL", "RMSP"]
            (freqListPfam, freqListClan) = AnaFamFrequency_onelist(
                pairInfoListDict[cls], seqid2pfamidDict, seqid2clanidDict,
                pfamid2seqidDict, clanid2seqidDict, tm_pfamidSet,
                tm_clanidSet, idSet_TMpro, usedPfamIDSet)

            # NOTE(review): cmpclassList stays unbound when the method is
            # neither 1 nor 3 (the first use then raises
            # UnboundLocalError) - confirm whether other methods can occur.
            if g_params['pairwise_comparison_method'] == 1:
                cmpclassList = cmpClassList_method1
            elif g_params['pairwise_comparison_method'] == 3:
                cmpclassList = cmpClassList_method3

            # (file name tag, frequency list, definition dict); Pfam first,
            # then clan, matching the original output order.
            famKinds = (("pfam", freqListPfam, pfamidDefDict),
                        ("clan", freqListClan, clanidDefDict))
            filePrefix = "%s%s%s.%s.%s" % (outpath, os.sep,
                                           g_params['outname'], alignrange,
                                           cls)

            isCmpDup = False
            for (famTag, freqList, defDict) in famKinds:
                outFileFamPairCount = "%s.%s.paircount.txt" % (filePrefix,
                                                               famTag)
                WriteFamPairCount(freqList, pairInfoListDict[cls], defDict,
                                  cmpclassList,
                                  g_params['pairwise_comparison_method'],
                                  isCmpDup, outFileFamPairCount)

            if g_params['pairwise_comparison_method'] == 3:
                # if mp=3, write another statistics with cmpdup
                isCmpDup = True
                cmpclassList = cmpClassList_mp3_cmpdup
                for (famTag, freqList, defDict) in famKinds:
                    outFileFamPairCount = "%s.cmpdup.%s.paircount.txt" % (
                        filePrefix, famTag)
                    WriteFamPairCount(freqList, pairInfoListDict[cls],
                                      defDict, cmpclassList,
                                      g_params['pairwise_comparison_method'],
                                      isCmpDup, outFileFamPairCount)

            for mode_norm in ["norm_diag", "no_norm"]:
                # Both modes currently select the 'half' layout; the 'full'
                # branch is kept for when other modes are added.
                if mode_norm in ["norm_diag", "no_norm"]:
                    heatmapmode = 'half'
                else:
                    heatmapmode = 'full'
                outFileNumTMHeatMap = "%s%s%s.%s.%s.%s.%s.txt" % (
                    outpath, os.sep, g_params['outname'], alignrange,
                    heatmapmode, cls, mode_norm)
                if heatmapmode == 'full':
                    mtx = myfunc.FillSymmetricMatrix(
                        dataTableNumTMHeatMap[cls]['data'],
                        dataTableNumTMHeatMap[cls]['maxNumTM'])
                else:
                    # Alias, not a copy: zeroing the diagonal below also
                    # changes the stored table.
                    mtx = dataTableNumTMHeatMap[cls]['data']
                if mode_norm == "no_norm":
                    for i in range(dataTableNumTMHeatMap[cls]['maxNumTM']):
                        mtx[i][i] = 0
                if WriteNumTMHeatMap(mtx,
                                     dataTableNumTMHeatMap[cls]['maxNumTM'],
                                     dataTableNumTMHeatMap[cls]['numPair'],
                                     mode_norm, outFileNumTMHeatMap) == 0:
                    sys.stdout.write("heatmap %s output\n" %
                                     (outFileNumTMHeatMap))
                    cmd = "%s/plotNumTMHeatMap.sh %s" % (binpath,
                                                         outFileNumTMHeatMap)
                    os.system(cmd)

                outFileSpecialPairAna = (
                    "%s%s%s.%s.%s.%s.%s.specialpairana.txt" % (
                        outpath, os.sep, g_params['outname'], alignrange,
                        g_params['numTMHeatMapMode'], cls, mode_norm))
                WriteSpecialPair(dataTableNumTMHeatMap[cls],
                                 pairInfoListDict[cls], seqid2pfamidDict,
                                 seqid2clanidDict, tm_pfamidSet, tm_clanidSet,
                                 pfamidDefDict, clanidDefDict, SPE_PAIR_LIST,
                                 outFileSpecialPairAna)
                sys.stdout.write("Anafile %s output\n" %
                                 (outFileSpecialPairAna))
    except IOError:
        return 1
def main():#{{{
    """Write the topology alignments of pairs with internal variation.

    Command line (read from ``sys.argv``):
        infile                 paircmp file (also accepted after ``--``)
        -aln FILE              pairwise topology alignment file (required)
        -o/-outfile FILE       alignment output; stdout when absent or not
                               writable
        -write-paircmp FILE    also write the selected paircmp records here
        -gap, -dg, -min-seqidt, -max-seqidt FLOAT
                               set the matching entry of ``parameters``
        -q                     accepted; the flag is not consulted here
        -h/--help              print help and exit

    Records are read in chunks of ``BLOCK_SIZE`` and filtered with
    ``FilterPairCmpResult``.  For every record accepted by
    ``IsHasInternalVariation`` the alignment stored under the key
    "<id1>-<id2>" is written as two FASTA-like entries.

    Returns:
        0 on success, 1 when called without arguments, -1 on a usage error
        or when the input file is missing or cannot be opened.
    """
    numArgv = len(sys.argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    parameters = {
        'minGapFraction': 0.5,
        'maxGapFraction': 1.0,
        'minDGvalue': -999999.0,
        'maxDGvalue': 1.0,
        'minSeqIDT': 0.0,
        'maxSeqIDT': 100.0,
    }
    # option spelling -> key of `parameters` that takes the float value
    floatOptions = {
        "-gap": 'minGapFraction',
        "--gap": 'minGapFraction',
        "-dg": 'maxDGvalue',
        "--dg": 'maxDGvalue',
        "-min-seqidt": 'minSeqIDT',
        "--min-seqidt": 'minSeqIDT',
        "-max-seqidt": 'maxSeqIDT',
        "--max-seqidt": 'maxSeqIDT',
    }
    outfileOptions = ('-o', '--o', "-outfile", "--outfile")
    paircmpOptions = ("-write-paircmp", "--write-paircmp")
    alnOptions = ("-aln", "--aln")

    infile = ""
    outfile = ""
    outPaircmpfile = ""
    pairalnTopoFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = sys.argv[i]
        if isNonOptionArg:
            infile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ("-h", "--help"):
                PrintHelp()
                sys.exit()
            elif arg == "-q":
                i += 1
            elif (arg in floatOptions or arg in outfileOptions or
                  arg in paircmpOptions or arg in alnOptions):
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value.\n" % arg)
                    return -1
                value = sys.argv[i + 1]
                if arg in floatOptions:
                    parameters[floatOptions[arg]] = float(value)
                elif arg in outfileOptions:
                    outfile = value
                elif arg in paircmpOptions:
                    outPaircmpfile = value
                else:
                    pairalnTopoFile = value
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return -1
        else:
            infile = arg
            i += 1

    if infile == "":
        sys.stderr.write("infile not set. Exit.\n")
        return -1
    elif not os.path.exists(infile):
        sys.stderr.write("infile %s does not exists. Exit.\n" % infile)
        # The message announces an exit, so actually stop here.
        return -1
    if pairalnTopoFile == "":
        sys.stderr.write("pairalnTopoFile not set. Exit.\n")
        return -1

    # pairTopoAlnDict[id1-id2]['id1] ['id2'] ['anno1'] ['anno2] ['seq1']
    # ['seq2']
    pairTopoAlnDict = GetPairTopoAln(pairalnTopoFile)

    fpin = None
    fppaircmp = None
    fpout = sys.stdout
    cntTotalOutputRecord = 0
    cntTotalReadInRecord = 0
    try:
        try:
            fpin = open(infile, "rb")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" % (infile))
            return -1

        if outPaircmpfile != "":
            fppaircmp = open(outPaircmpfile, "w")
        if outfile != "":
            try:
                fpout = open(outfile, "w")
            except IOError:
                sys.stderr.write("Failed to write to file %s.\n" % outfile)
                sys.stderr.write("Reset output to sys.stdout.\n")
                fpout = sys.stdout

        unprocessedBuffer = ""
        while True:
            buff = fpin.read(BLOCK_SIZE)
            isEOFreached = not buff
            pairCmpRecordList = []
            # Text the parser could not consume yet is carried over to the
            # next chunk.
            unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                unprocessedBuffer + buff, pairCmpRecordList)
            if pairCmpRecordList:
                filteredList = FilterPairCmpResult(pairCmpRecordList,
                                                   parameters)
                for record in filteredList:
                    if not IsHasInternalVariation(record):
                        continue
                    if fppaircmp is not None:
                        (status, cntTotalOutputRecord
                         ) = lcmp.WritePairCmpRecord([record],
                                                     cntTotalOutputRecord,
                                                     fppaircmp)
                    key = "%s-%s" % (record['id1'], record['id2'])
                    pair = pairTopoAlnDict[key]
                    fpout.write(">%s\n" % pair['anno1'])
                    fpout.write("%s\n" % pair['seq1'])
                    fpout.write(">%s\n" % pair['anno2'])
                    fpout.write("%s\n" % pair['seq2'])
                cntTotalReadInRecord += len(pairCmpRecordList)
            if isEOFreached:
                break
    finally:
        # Release every handle that was opened, on success and on error.
        if fpin is not None:
            fpin.close()
        if fppaircmp is not None:
            fppaircmp.close()
        if fpout is not None and fpout is not sys.stdout:
            fpout.close()

    sys.stdout.write("cntTotalReadInRecord = %s\n" % cntTotalReadInRecord)
    sys.stdout.write("cntTotalOutputRecord = %s\n" % cntTotalOutputRecord)
    return 0
def main(g_params):  #{{{
    """Run ``StatDupPaircmp`` over all records of a paircmp file.

    Command line (read from ``sys.argv``):
        -paircmp FILE   pairwise comparison file (required)
        -dup FILE       duplicated pair list (required)
        -o FILE         output file (opened with ``myfunc.myopen``)
        -outpath DIR    accepted; value not used by this function
        -l FILE         accepted; value not used by this function
        -q              sets g_params['isQuiet']
        -h/--help       print help
    A positional argument is rejected, except directly after ``--`` where
    it is skipped.

    Args:
        g_params: dict of global parameters; 'isQuiet' may be set.

    Returns:
        0 on success, 1 on a usage error, after printing help, or when the
        paircmp file cannot be opened.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    dupfile = ""
    paircmpfile = ""
    outfile = ""
    valueOptions = ("-outpath", "--outpath", "-o", "--o", "-l", "--l",
                    "-dup", "--dup", "-paircmp", "--paircmp")

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            # The argument following "--" is skipped without being used.
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            elif arg == "-q":
                g_params['isQuiet'] = True
                i += 1
            elif arg in valueOptions:
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value.\n" % arg)
                    return 1
                value = argv[i + 1]
                if arg in ("-o", "--o"):
                    outfile = value
                elif arg in ("-dup", "--dup"):
                    dupfile = value
                elif arg in ("-paircmp", "--paircmp"):
                    paircmpfile = value
                # -outpath and -l are accepted but their values are unused.
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            sys.stderr.write("Error! Wrong argument: %s\n" % arg)
            return 1

    # Say why the run is refused instead of returning silently.
    if paircmpfile == "":
        sys.stderr.write("paircmpfile not set. Exit.\n")
        return 1
    if dupfile == "":
        sys.stderr.write("dupfile not set. Exit.\n")
        return 1

    dupPairSet = set(ReadDupPairList(dupfile))

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        try:
            fpin = open(paircmpfile, "r")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" %
                             (paircmpfile))
            return 1
        try:
            unprocessedBuffer = ""
            while True:
                buff = fpin.read(BLOCK_SIZE)
                isEOFreached = not buff
                pairCmpRecordList = []
                # Text the parser could not consume yet is carried over to
                # the next chunk.
                unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                    unprocessedBuffer + buff, pairCmpRecordList)
                if pairCmpRecordList:
                    StatDupPaircmp(pairCmpRecordList, dupPairSet, fpout)
                if isEOFreached:
                    break
        finally:
            fpin.close()
    finally:
        myfunc.myclose(fpout)
    return 0
def main(g_params):  #{{{
    """Render the records of a paircmp file as an HTML report.

    Command line (read from ``sys.argv``; every option is accepted with one
    or two leading dashes unless noted):
        paircmpFile            input file (also accepted after ``--``)
        -outpath DIR           output directory, created when missing
        -htmlname NAME         name handed to WriteHTML (default 'index')
        -pfamdef FILE          Pfam definition list
        -seqdef FILE           sequence definition file, '|' separated
        -tableinfo FILE        pairwise alignment table info
        -alnfigpath, -msapath, -msapath2, -treepath, -ordermsapath,
        -topomsapath, -type    string values stored in g_params
        -gap, -dg, -min-seqidt, -max-seqidt   float values stored in g_params
        -tableformat INT       g_params['htmltableformat']
        -filter-predseq y|n    g_params['isFilterPredictedSeq']
        -pickone               keep one pair per Pfam family
        -q                     single dash only; accepted, not consulted
        -h / --help            print help

    Args:
        g_params: dict of global parameters, updated in place; 'outpath'
            is always set.

    Returns:
        0 on success, 1 on a usage error, after printing help, or when a
        required file is missing or cannot be opened.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    # Options whose value is kept in a local setting: name -> value.
    localOpts = {
        'outpath': "",
        'htmlname': 'index',
        'pfamdef': '/data3/data/pfam/pfamA.seed.ac-delist',
        'seqdef': '/data3/wk/MPTopo/pfamAna/pfam2-selTM-giid-refseqid-pfamid-description.txt',
        'tableinfo': "",
    }
    # Options stored in g_params: name -> (g_params key, converter).
    paramOpts = {
        'alnfigpath': ('alnFigPath', str),
        'msapath': ('MSAPath', str),
        'msapath2': ('MSAPath2', str),
        'treepath': ('treepath', str),
        'ordermsapath': ('ordermsapath', str),
        'topomsapath': ('topomsapath', str),
        'type': ('selecttype', str),
        'gap': ('minGapFraction', float),
        'dg': ('maxDGvalue', float),
        'min-seqidt': ('minSeqIDT', float),
        'max-seqidt': ('maxSeqIDT', float),
        'tableformat': ('htmltableformat', int),
    }

    i = 1
    isNonOptionArg = False
    isPickOne = False
    paircmpFile = ""
    while i < numArgv:  #{{{
        arg = argv[i]
        if isNonOptionArg:
            paircmpFile = arg
            isNonOptionArg = False
            i += 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            # Strip exactly one or two leading dashes; "---x" stays invalid.
            if arg.startswith("--"):
                name = arg[2:]
            else:
                name = arg[1:]
            if arg in ("-h", "--help"):
                PrintHelp()
                return 1
            elif arg == "-q":
                i += 1
            elif name == "pickone":
                isPickOne = True
                i += 1
            elif (name in localOpts or name in paramOpts or
                  name == "filter-predseq"):
                # Report a missing value instead of failing with IndexError.
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value.\n" % arg)
                    return 1
                value = argv[i + 1]
                if name in localOpts:
                    localOpts[name] = value
                elif name in paramOpts:
                    (key, convert) = paramOpts[name]
                    g_params[key] = convert(value)
                else:
                    g_params['isFilterPredictedSeq'] = (
                        value[:1].lower() == 'y')
                i += 2
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            paircmpFile = arg
            i += 1
    #}}}

    outpath = localOpts['outpath']
    htmlname = localOpts['htmlname']
    pfamACDEListFile = localOpts['pfamdef']
    seqDefFile = localOpts['seqdef']
    tableinfoFile = localOpts['tableinfo']

    g_params['outpath'] = outpath
    if paircmpFile == "":
        sys.stderr.write("paircmpFile not set. Exit.\n")
        return 1
    # os.makedirs avoids a shell with an unquoted path; an empty outpath
    # means "no directory to create".
    if outpath != "" and not os.path.exists(outpath):
        os.makedirs(outpath)

    # read paircmprecordlist
    try:
        with open(paircmpFile, 'r') as fpin:
            buff = fpin.read()
    except IOError:
        sys.stderr.write("Failed to open input file %s\n" % paircmpFile)
        return 1
    recordList = []
    lcmp.ReadPairCmpResultFromBuffer(buff, recordList)
    sys.stdout.write("len(recordList) = %d\n" % len(recordList))

    seqIDListSet = set([])
    for record in recordList:
        seqIDListSet.add(record['id1'])
        seqIDListSet.add(record['id2'])
    sys.stdout.write("len(seqIDListSet) = %d\n" % len(seqIDListSet))

    # Read In pairwise alignment info
    pairalnStat = {}
    if tableinfoFile != "" and os.path.exists(tableinfoFile):
        pairalnStat = ReadInTableInfo(tableinfoFile)
    sys.stdout.write("len(pairalnStat) = %d\n" % len(pairalnStat))

    #Read In pfamDefList
    if not os.path.exists(pfamACDEListFile):
        sys.stderr.write(
            "Error! file pfamACDEListFile (%s) does not exist.\n" %
            pfamACDEListFile)
        return 1
    pfamDefDict = ReadPfamDEList(pfamACDEListFile)
    sys.stdout.write("len(pfamDefDict)= %d\n" % len(pfamDefDict))

    #Read in seqinfoList
    seqInfoDict = {}
    if not os.path.exists(seqDefFile):
        sys.stderr.write("Error! file seqDefFile (%s) does not exist.\n" %
                         seqDefFile)
        return 1
    with open(seqDefFile, "r") as fpin:
        next(fpin, None)  # the first line is skipped, as before
        for line in fpin:
            strs = line.split('|')
            if len(strs) != 4:
                continue
            gid = strs[0].strip()
            if gid in seqIDListSet:
                pfamid = strs[2].strip()
                seqInfoDict[gid] = {
                    'pfamid': pfamid,
                    'refseqid': strs[1].strip(),
                    'seqdef': strs[3].strip(),
                    'pfamdef': pfamDefDict[pfamid],
                }
    sys.stdout.write("len(seqInfoDict)= %d\n" % len(seqInfoDict))

    # add tableinfo to record list
    sys.stdout.write("Add pairwise alignment table info to record...\n")
    AddTableInfo(recordList, pairalnStat)
    sys.stdout.write("Add seqdef to record ...\n")
    AddSeqDefInfo(recordList, seqInfoDict)

    filteredRecordList = FilterPairCmpResult(recordList)
    del recordList
    sys.stdout.write("numFilteredRecordList = %d\n" %
                     len(filteredRecordList))

    # reorder list according to pfamid (stable sort on the Pfam ID of id1)
    tupList = [(idx, seqInfoDict[record['id1']]['pfamid'])
               for (idx, record) in enumerate(filteredRecordList)]
    sorted_by_pfamid = sorted(tupList, key=lambda tup: tup[1])
    pairCmpRecordList = [filteredRecordList[tup[0]]
                         for tup in sorted_by_pfamid]

    if isPickOne:
        pairCmpRecordList = PickOnlyOneForEachPfam(pairCmpRecordList,
                                                   sorted_by_pfamid)
    sys.stdout.write("len(pairCmpRecordList) =  %d\n" %
                     len(pairCmpRecordList))

    WriteHTML(pairCmpRecordList, seqInfoDict, htmlname, outpath)
    return 0
def main(g_params):  #{{{
    """Collect per-family pair information and write an HTML report.

    Command line (read from ``sys.argv``; options taking a value are parsed
    with the ``myfunc.my_getopt_*`` helpers):
        -outpath DIR, -topodb DB, -seqdb DB, -pdb2sp/-pdbtosp FILE,
        -seqmsapath DIR, -datapath DIR, -seq2pfam FILE, -pfam2seq FILE,
        -description STR, -pfamdef FILE, -alignrange all|full|part,
        -basename STR, -treepath DIR, -pairalnpath DIR, -maxperfamily INT,
        -min-seqidt FLOAT, -max-seqidt FLOAT, -shortid2fullid FILE,
        -debug y|n, -q, -h/--help
    Positional arguments are rejected.

    The paircmp file "<datapath>/<basename>.paircmp" is read in chunks of
    ``myfunc.BLOCK_SIZE`` and filtered; the pair-info file of every class
    in g_params['cmpClassList_mp3_cmpdup'] is added to the data table,
    which is finally passed to ``WriteHTML``.  A temporary directory
    serves as default for empty 'msapath', 'treepath' and 'pairalnpath'
    and is removed before returning.

    Args:
        g_params: dict of global parameters; read and updated in place.

    Returns:
        0 on success, 1 on a usage error, after printing help, or when a
        required file is missing or cannot be opened.
    """
    import shutil  # local import: only needed for the temp-dir cleanup

    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outpath = ""
    shortid2fullidFile = ""
    seqid2pfamidMapFile = ""
    # Initialised so that omitting -pfam2seq is reported by the file check
    # below instead of raising UnboundLocalError.
    pfamid2seqidMapFile = ""
    pfamDefFile = '/data3/data/pfam/pfam27.0/Pfam-A.clans.tsv'
    topodb = ""
    seqdb = ""
    pdb2spFile = ""

    i = 1
    isNonOptionArg = False
    while i < numArgv:
        arg = argv[i]
        if isNonOptionArg:
            # Positional arguments are not accepted, not even after "--".
            return 1
        elif arg == "--":
            isNonOptionArg = True
            i += 1
        elif arg.startswith("-"):
            if arg in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif arg in ["-outpath", "--outpath"]:
                (outpath, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-topodb", "--topodb"]:
                (topodb, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-pdb2sp", "--pdb2sp", "-pdbtosp", "--pdbtosp"]:
                # "--pdb2sp" replaces a duplicated "-pdb2sp" entry.
                (pdb2spFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-seqdb", "--seqdb"]:
                (seqdb, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-seqmsapath", "--seqmsapath"]:
                (g_params['seqmsapath'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-datapath", "--datapath"]:
                (g_params['datapath'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-seq2pfam", "--seq2pfam"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-pfam2seq", "--pfam2seq"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-description", "--description"]:
                (g_params['description'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-alignrange", "--alignrange"]:
                g_params['alignrange'], i = myfunc.my_getopt_str(argv, i)
                if g_params['alignrange'] not in ['all', 'full', 'part']:
                    sys.stderr.write(
                        "alignrange must be one of [all, full, part]\n")
                    return 1
                if g_params['alignrange'] == 'full':
                    g_params['alignrange'] = 'FULL_ALIGNED'
                elif g_params['alignrange'] == 'part':
                    g_params['alignrange'] = 'PART_ALIGNED'
            elif arg in ["-basename", "--basename"]:
                (g_params['basename'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-treepath", "--treepath"]:
                (g_params['treepath'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-pairalnpath", "--pairalnpath"]:
                (g_params['pairalnpath'], i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-maxperfamily", "--maxperfamily"]:
                (g_params['max_num_output_per_family'],
                 i) = myfunc.my_getopt_int(argv, i)
            elif arg in ["-min-seqidt", "--min-seqidt"]:
                g_params['minSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif arg in ["-max-seqidt", "--max-seqidt"]:
                g_params['maxSeqIDT'], i = myfunc.my_getopt_float(argv, i)
            elif arg in ["-shortid2fullid", "--shortid2fullid"]:
                (shortid2fullidFile, i) = myfunc.my_getopt_str(argv, i)
            elif arg in ["-debug", "--debug"]:
                if i + 1 >= numArgv:
                    sys.stderr.write(
                        "Error! Option %s requires a value.\n" % arg)
                    return 1
                g_params['isDEBUG'] = argv[i + 1][:1].lower() == 'y'
                i += 2
            elif arg in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % arg)
                return 1
        else:
            sys.stderr.write("Error! Wrong argument: %s\n" % arg)
            return 1

    if g_params['basename'] == "":
        sys.stderr.write("basename not set. exit\n")
        return 1
    if myfunc.checkfile(g_params['datapath'], "datapath") != 0:
        return 1
    if myfunc.checkfile(seqid2pfamidMapFile, "seqid2pfamidMapFile") != 0:
        return 1
    if myfunc.checkfile(pfamid2seqidMapFile, "pfamid2seqidMapFile") != 0:
        return 1
    if myfunc.checkfile(topodb + "0.db", "topodb") != 0:
        return 1
    if myfunc.checkfile(seqdb + "0.db", "seqdb") != 0:
        return 1
    if myfunc.checkfile(g_params['seqmsapath'], "seqmsapath") != 0:
        return 1

    if pdb2spFile != "":
        (g_params['pdb2uniprotMap'],
         g_params['uniprot2pdbMap']) = myfunc.ReadPDBTOSP(pdb2spFile)

    if g_params['datapath'] == "":
        sys.stderr.write("datapath not set\n")
        return 1
    elif not os.path.exists(g_params['datapath']):
        sys.stderr.write("datapath %s does not exist\n" %
                         (g_params['datapath']))
        return 1
    if outpath == "":
        sys.stderr.write("outpath not set\n")
        return 1
    elif not os.path.exists(outpath):
        os.makedirs(outpath)

    paircmpfile = "%s/%s.paircmp" % (g_params['datapath'],
                                     g_params['basename'])
    if myfunc.checkfile(paircmpfile, "paircmpfile") != 0:
        return 1

    (g_params['pfamidDefDict'],
     g_params['clanidDefDict']) = lcmp.ReadPfamDefFile(pfamDefFile)
    g_params['seqid2pfamidDict'] = myfunc.ReadFam2SeqidMap(
        seqid2pfamidMapFile)
    g_params['pfamid2seqidDict'] = myfunc.ReadFam2SeqidMap(
        pfamid2seqidMapFile)

    tmpdir = tempfile.mkdtemp()
    try:
        # The temporary directory is the fallback for every unset path.
        for key in ('msapath', 'treepath', 'pairalnpath'):
            if g_params[key] == "":
                g_params[key] = tmpdir

        try:
            fpin = open(paircmpfile, "r")
        except IOError:
            sys.stderr.write("Failed to open input file %s\n" %
                             (paircmpfile))
            return 1

        pairCmpRecordList = []
        cntTotalReadInRecord = 0
        try:
            unprocessedBuffer = ""
            while True:
                buff = fpin.read(myfunc.BLOCK_SIZE)
                isEOFreached = not buff
                rdList = []
                # Text the parser could not consume yet is carried over to
                # the next chunk.
                unprocessedBuffer = lcmp.ReadPairCmpResultFromBuffer(
                    unprocessedBuffer + buff, rdList)
                rdList = FilterPairCmpResult(rdList)
                cntTotalReadInRecord += len(rdList)
                pairCmpRecordList += rdList
                if isEOFreached:
                    break
        finally:
            fpin.close()
        sys.stdout.write("cntTotalReadInRecord = %s\n" %
                         cntTotalReadInRecord)

        g_params['hdl_seqdb'] = myfunc.MyDB(seqdb)
        g_params['hdl_topodb'] = myfunc.MyDB(topodb)

        g_params['OS'] = os.uname()[0]
        if g_params['OS'].find('Linux') != -1:
            g_params['CP_EXE'] = "/bin/cp -uf"
        else:
            g_params['CP_EXE'] = "/bin/cp -f"

        if shortid2fullidFile != "":
            g_params['uniprotAC2FullSeqIDMap'] = myfunc.ReadID2IDMap(
                shortid2fullidFile)

        dataTable = {}
        # structure of dataTable
        # dataTable[pfamid] = {'set_seqid':set(),
        #     'difftopopair':[{'INV':[(id1,id2)]},{'TM2GAP':},{}}
        # first read in pairCmpRecordList
        AddAllSeqInPairCmp(dataTable, pairCmpRecordList,
                           g_params['seqid2pfamidDict'])

        for cmpclass in g_params['cmpClassList_mp3_cmpdup'][0:]:
            pairInfoFile = "%s/%s_.cmpdup.FULL_ALIGNED.%s.pairinfo.txt" % (
                g_params['datapath'], g_params['basename'], cmpclass)
            pairinfoList = ReadPairInfo_cmpclass(pairInfoFile)
            AddPairInfo(dataTable, pairinfoList, cmpclass)

        if g_params['isDEBUG']:  #{{{
            for pfamid in dataTable:
                sys.stdout.write("%s\n" % (pfamid,))
                sys.stdout.write("\tset_seqid\n")
                sys.stdout.write("%s\n" % (dataTable[pfamid]['set_seqid'],))
                sys.stdout.write("\tdifftopopair\n")
                for cls in dataTable[pfamid]['difftopopair']:
                    # No blank after the tabs: the Python 2 print statement
                    # omits the separator after whitespace other than ' '.
                    sys.stdout.write("\t\t%s\n" % (cls,))
                    for tup in dataTable[pfamid]['difftopopair'][cls]:
                        sys.stdout.write("\t\t\t%s\n" % (tup,))
        #}}}
        WriteHTML(dataTable, outpath)
    finally:
        # Remove the temporary directory on every exit path, without a
        # shell.
        shutil.rmtree(tmpdir, ignore_errors=True)
    return 0