def RemoveDupSeq(infile, g_outpath, method, isUseMD5):#{{{
    """Copy a FASTA file, keeping only the first record seen for each key.

    Records are read block by block with ``myfunc.ReadFastaByBlock`` and the
    kept ones are written as ``>description\\nseq\\n`` to
    ``<outpath>/<rootname>``, where ``rootname`` is the basename of
    ``infile`` without its last extension.

    Args:
        infile: path of the input FASTA file.
        g_outpath: output directory; "" means the directory of ``infile``.
        method: "id" to deduplicate on ``rd.seqid``, "seq" to deduplicate on
            the sequence itself.
        isUseMD5: only used with method "seq"; when true the MD5 digest of
            the sequence is used as the key instead of the sequence.

    Returns:
        0 on success, 1 if the output file cannot be used (cannot be opened,
        or would overwrite the input), -1 if the input cannot be read.

    Raises:
        ValueError: if ``method`` is neither "id" nor "seq".
    """
    # Local imports keep this block self-contained; hashlib replaces the
    # deprecated ``md5`` module and yields the same digests.
    import hashlib
    import sys

    # Reject an unknown method up front instead of failing later on an
    # unbound ``key`` after the output file has already been created.
    if method not in ("id", "seq"):
        raise ValueError(
            "unsupported method %r, expected \"id\" or \"seq\"" % (method,))

    if g_outpath == "":
        outpath = myfunc.my_dirname(infile)
    else:
        outpath = g_outpath
    rootname = os.path.basename(os.path.splitext(infile)[0])
    outfile = "%s%s%s"%(outpath, os.sep, rootname)

    # An input without extension written to its own directory would map onto
    # itself; opening it for writing would truncate the input before reading.
    if os.path.abspath(outfile) == os.path.abspath(infile):
        sys.stderr.write(
            "Error! output file %s would overwrite the input file\n"
            % (outfile))
        return 1

    fpout = myfunc.myopen(outfile, None, "w", False)
    if fpout is None:
        return 1
    try:
        hdl = myfunc.ReadFastaByBlock(infile)
        if hdl.failure:
            return -1
        try:
            seen = set()
            recordList = hdl.readseq()
            while recordList is not None:
                for rd in recordList:
                    if method == "id":
                        key = rd.seqid
                    elif isUseMD5:
                        seq = rd.seq
                        # hashlib needs bytes; a Python 2 str already is.
                        if not isinstance(seq, bytes):
                            seq = seq.encode("utf-8")
                        key = hashlib.md5(seq).digest()
                    else:
                        key = rd.seq
                    if key not in seen:
                        seen.add(key)
                        fpout.write(">%s\n%s\n"%(rd.description, rd.seq))
                recordList = hdl.readseq()
        finally:
            hdl.close()
    finally:
        # Always release the output handle, including on the -1 path.
        myfunc.myclose(fpout)
    return 0
def RemoveDupSeq(infile, g_outpath, method, isUseMD5): #{{{
    """Copy a FASTA file, keeping only the first record seen for each key.

    Records are read block by block with ``myfunc.ReadFastaByBlock`` and the
    kept ones are written as ``>description\\nseq\\n`` to
    ``<outpath>/<rootname>``, where ``rootname`` is the basename of
    ``infile`` without its last extension.

    Args:
        infile: path of the input FASTA file.
        g_outpath: output directory; "" means the directory of ``infile``.
        method: "id" to deduplicate on ``rd.seqid``, "seq" to deduplicate on
            the sequence itself.
        isUseMD5: only used with method "seq"; when true the MD5 digest of
            the sequence is used as the key instead of the sequence.

    Returns:
        0 on success, 1 if the output file cannot be used (cannot be opened,
        or would overwrite the input), -1 if the input cannot be read.

    Raises:
        ValueError: if ``method`` is neither "id" nor "seq".
    """
    # Local imports keep this block self-contained; hashlib replaces the
    # deprecated ``md5`` module and yields the same digests.
    import hashlib
    import sys

    # Reject an unknown method up front instead of failing later on an
    # unbound ``key`` after the output file has already been created.
    if method not in ("id", "seq"):
        raise ValueError(
            "unsupported method %r, expected \"id\" or \"seq\"" % (method,))

    if g_outpath == "":
        outpath = myfunc.my_dirname(infile)
    else:
        outpath = g_outpath
    rootname = os.path.basename(os.path.splitext(infile)[0])
    outfile = "%s%s%s" % (outpath, os.sep, rootname)

    # An input without extension written to its own directory would map onto
    # itself; opening it for writing would truncate the input before reading.
    if os.path.abspath(outfile) == os.path.abspath(infile):
        sys.stderr.write(
            "Error! output file %s would overwrite the input file\n"
            % (outfile))
        return 1

    fpout = myfunc.myopen(outfile, None, "w", False)
    if fpout is None:
        return 1
    try:
        hdl = myfunc.ReadFastaByBlock(infile)
        if hdl.failure:
            return -1
        try:
            seen = set()
            recordList = hdl.readseq()
            while recordList is not None:
                for rd in recordList:
                    if method == "id":
                        key = rd.seqid
                    elif isUseMD5:
                        seq = rd.seq
                        # hashlib needs bytes; a Python 2 str already is.
                        if not isinstance(seq, bytes):
                            seq = seq.encode("utf-8")
                        key = hashlib.md5(seq).digest()
                    else:
                        key = rd.seq
                    if key not in seen:
                        seen.add(key)
                        fpout.write(">%s\n%s\n" % (rd.description, rd.seq))
                recordList = hdl.readseq()
        finally:
            hdl.close()
    finally:
        # Always release the output handle, including on the -1 path.
        myfunc.myclose(fpout)
    return 0
def main(g_params): #{{{
    """Parse the command line, resolve input files and load lookup tables.

    Options are read from ``sys.argv``; values are stored either in local
    variables or in ``g_params`` (keys 'outpath', 'pairwise_comparison_method',
    'mindiffpair', 'winsize', 'outname', 'isQuiet', 'isOnlyAnaProkar',
    'isOnlyAnaEukar', 'prokarSeqIDSet', 'eukarSeqIDSet').  Any file name left
    empty is replaced by a default below ``DATADIR3`` (or next to the input
    file) and each file is checked with ``myfunc.checkfile``.

    Args:
        g_params: dict of run-time parameters, read and updated in place.

    Returns:
        1 when help is printed, an argument is invalid, a required file is
        missing, an ID list is empty or the output directory cannot be
        created.  The part visible here otherwise falls off the end.
        NOTE(review): the locals loaded here (dictionaries, sets, DB handle)
        are presumably consumed by code that follows this set-up -- confirm.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    SPE_PAIR_LIST = [(2, 1), (2, 4), (2, 6), (2, 8), (3, 6), (3, 7), (4, 6),
                     (4, 8), (4, 10), (5, 7), (5, 10), (6, 8), (6, 10),
                     (6, 12), (7, 14), (8, 10), (8, 12), (10, 12), (10, 13),
                     (11, 13), (12, 14)]

    outfile = ""
    infile = ""
    pfamDefFile = "%s/data/pfam/pfam26.0/Pfam-A.clans.tsv" % (DATADIR3)
    signalpFile = "%s/wk/MPTopo/pfamAna_refpro/pred_signalp/refpro20120604-celluar.selmaxlength-m1.nr100.signalp_list" % (
        DATADIR3)

    #seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2clanid"%(DATADIR3)
    #seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.seqid2pfamid"%(DATADIR3)
    seqid2clanidMapFile = ""
    seqid2pfamidMapFile = ""
    tm_pfamidListFile = ""
    tm_clanidListFile = ""
    pfamid2seqidMapFile = ""
    clanid2seqidMapFile = ""
    dbname_predTM = ""
    pairlistwithpfamidFile = ""
    pfamtype = ""
    pairListFile = ""

    #classList_TableNumTMHeatMap = ["ALL", "RMSP"]
    classList_TableNumTMHeatMap = ["ALL"]

    i = 1
    # Set by "--": only the single argument that follows is taken as infile.
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            infile = argv[i]
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        # startswith() instead of argv[i][0]: an empty-string argument must
        # not raise IndexError; it falls through to the positional branch.
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-outpath", "--outpath"]:
                (g_params['outpath'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (fileListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-signalp", "--signalp"]:
                (signalpFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-mp", "--mp"]:
                (g_params['pairwise_comparison_method'],
                 i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-mindiffpair", "--mindiffpair"]:
                (g_params['mindiffpair'], i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-pfamtype", "--pfamtype"]:
                (pfamtype, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanidlist", "--clanidlist"]:
                (tm_clanidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamidlist", "--pfamidlist"]:
                (tm_pfamidListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2clanid", "--seqid2clanid"]:
                (seqid2clanidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqid2pfamid", "--seqid2pfamid"]:
                (seqid2pfamidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pfamid2seqid", "--pfamid2seqid"]:
                (pfamid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-clanid2seqid", "--clanid2seqid"]:
                (clanid2seqidMapFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlistwithpfamid", "--pairlistwithpfamid"]:
                (pairlistwithpfamidFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-predTMdbname", "--predTMdbname"]:
                (dbname_predTM, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pairlist", "--pairlist"]:
                (pairListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-winsize", "--winsize"]:
                (g_params['winsize'], i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-outname", "--outname"]:
                (g_params['outname'], i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            elif argv[i] in ["-prokar", "--prokar"]:
                g_params['isOnlyAnaProkar'] = True
                i += 1
            elif argv[i] in ["-eukar", "--eukar"]:
                g_params['isOnlyAnaEukar'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % (argv[i]))
                return 1
        else:
            infile = argv[i]
            i += 1

    if myfunc.checkfile(
            infile, "%s (line %d): infile" %
            (__file__, inspect.currentframe().f_lineno)) != 0:
        return 1

    dirpath = myfunc.my_dirname(infile)

    # try to obtain Pfam family tag: from -pfamtype when given, otherwise
    # from the name of the input file.  The first match wins.
    tag = ""
    if pfamtype != "":
        pfamtype_upper = pfamtype.upper()
        for (keyword, candidate) in [("FAM", ".Family"), ("DOM", ".Domain"),
                                     ("REP", ".Repeat"), ("MOT", ".Motif")]:
            if pfamtype_upper.find(keyword) != -1:
                tag = candidate
                break
    else:
        for candidate in [".Family", ".Domain", ".Repeat", ".Motif"]:
            if infile.find(candidate + ".") != -1:
                tag = candidate
                break

    # Fill in defaults for every file not given on the command line, then
    # verify each one.  The checks stay spelled out so that the reported
    # line number identifies the failing check.
    if seqid2clanidMapFile == "":
        seqid2clanidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.seqid2clanid" % (
            DATADIR3)
    if myfunc.checkfile(
            seqid2clanidMapFile, "%s (line %d): seqid2clanidMapFile" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    if seqid2pfamidMapFile == "":
        seqid2pfamidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.seqid2pfamid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            seqid2pfamidMapFile, "%s (line %d): seqid2pfamidMapFile" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    if pfamid2seqidMapFile == "":
        pfamid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.nr100.filter.fragmented.pfamid2seqid" % (
            DATADIR3)
    if myfunc.checkfile(
            pfamid2seqidMapFile, "%s (line %d): pfamid2seqidMapFile" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    if clanid2seqidMapFile == "":
        clanid2seqidMapFile = "%s/wk/MPTopo/pfamAna_refpro/pfammap_from_uniprot/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.nr100.filter.fragmented.clanid2seqid" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            clanid2seqidMapFile, "%s (line %d): clanid2seqidMapFile" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_pfamidListFile == "":
        tm_pfamidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20%s.pfamidlist" % (
            DATADIR3, tag)
    if myfunc.checkfile(
            tm_pfamidListFile, "%s (line %d): tm_pfamidListFile" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    if tm_clanidListFile == "":
        tm_clanidListFile = "%s/data/pfam/pfam26.0/Pfam-A-full.seqfrompfamfasta.percentTMpro_scampi.perTM75_nseq20.clanidlist" % (
            DATADIR3)
    if myfunc.checkfile(
            tm_clanidListFile, "%s (line %d): tm_clanidListFile" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    if dbname_predTM == "":
        dbname_predTM = "%s/wk/MPTopo/pfamAna_refpro/pred_topcons_single_method4/refpro20120604-celluar.selmaxlength-m1.topcons-single_topcons_single.m1.agree-44.RMSP" % (
            DATADIR3)
    # The database is checked through its "<dbname>0.db" file.
    if myfunc.checkfile(
            "%s0.db" % (dbname_predTM), "%s (line %d): dbname_predTM" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    if g_params['isOnlyAnaProkar']:
        prokarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Prokaryota.seqidlist" % (
            DATADIR3)
        g_params['prokarSeqIDSet'] = set(myfunc.ReadIDList(prokarseqidfile))
        if len(g_params['prokarSeqIDSet']) < 1:
            return 1
    if g_params['isOnlyAnaEukar']:
        eukarseqidfile = "%s/data/uniprot/reference_proteome/refpro20120604-celluar.selmaxlength-m1.nr100.filter.fragmented.Eukaryota.seqidlist" % (
            DATADIR3)
        g_params['eukarSeqIDSet'] = set(myfunc.ReadIDList(eukarseqidfile))
        if len(g_params['eukarSeqIDSet']) < 1:
            return 1

    if pairlistwithpfamidFile == "":
        pairlistwithpfamidFile = "%s/../../Pfam-.maxpair100.pairlistwithpfamid" % (
            dirpath)
    if myfunc.checkfile(
            pairlistwithpfamidFile, "%s (line %d): pairlistwithpfamidFile" %
            (__file__, inspect.currentframe().f_lineno)):
        return 1

    pfamid_2_seqidpair_Dict = ReadPairListWithFamID(pairlistwithpfamidFile)
    usedPfamIDSet = set(
        pfamid_2_seqidpair_Dict.keys())  # pfamids used in pair selection

    # A user supplied pair list replaces the built-in SPE_PAIR_LIST.
    if pairListFile != "":
        li = myfunc.ReadPairList(pairListFile)
        SPE_PAIR_LIST = []
        for tup in li:
            SPE_PAIR_LIST.append((int(tup[0]), int(tup[1])))

    (pfamidDefDict, clanidDefDict) = ReadPfamDefFile(pfamDefFile)
    signalpDict = lcmp.ReadSignalPDict(signalpFile)

    seqid2clanidDict = myfunc.ReadFam2SeqidMap(seqid2clanidMapFile)
    seqid2pfamidDict = myfunc.ReadFam2SeqidMap(seqid2pfamidMapFile)
    clanid2seqidDict = myfunc.ReadFam2SeqidMap(clanid2seqidMapFile)
    pfamid2seqidDict = myfunc.ReadFam2SeqidMap(pfamid2seqidMapFile)

    tm_pfamidList = myfunc.ReadIDList(tm_pfamidListFile)
    tm_clanidList = myfunc.ReadIDList(tm_clanidListFile)
    tm_pfamidSet = set(tm_pfamidList)
    tm_clanidSet = set(tm_clanidList)

    hdl_predTM = myfunc.MyDB(dbname_predTM)
    if not hdl_predTM.failure:
        idSet_TMpro = set(hdl_predTM.indexedIDList)
    else:
        idSet_TMpro = set([])

    #classList_TableNumTMHeatMap = ["ALL", "RMSP", "RMDUP"]
    #alignrangeList = ['FULL_ALIGNED', 'all', 'PART_ALIGNED']
    alignrangeList = ['FULL_ALIGNED']

    if g_params['outpath'] != "" and not os.path.exists(g_params['outpath']):
        cmd = ["mkdir", "-p", g_params['outpath']]
        try:
            subprocess.check_call(cmd)
        # "as" form is valid on Python 2.6+ and Python 3.
        except subprocess.CalledProcessError as e:
            print(e)
            return 1
def main(g_params): #{{{
    """Parse the command line and validate inputs for the analysis.

    Positional arguments (and the entries of the ``-l`` list file) are
    collected in ``fileList``; option values are stored in local variables
    or in ``g_params`` (keys 'seqidttype' and 'isQuiet').  The topology
    alignment file, the ``aapath`` location and the output file name are
    required, and the directory of the output file is created when missing.

    Args:
        g_params: dict of run-time parameters, read and updated in place.

    Returns:
        1 when help is printed, an argument is invalid, no input is given,
        a required file is missing, the output file is not set or the
        output directory cannot be created.  The part visible here otherwise
        falls off the end.
        NOTE(review): the thresholds and file names parsed here are
        presumably consumed by code that follows this set-up -- confirm.
    """
    argv = sys.argv
    numArgv = len(argv)
    if numArgv < 2:
        PrintHelp()
        return 1

    outfile = ""
    fileListFile = ""
    fileList = []
    pfamDefFile = "%s/data/pfam/pfam26.0/Pfam-A.clans.tsv" % (DATADIR3)
    threshold_Fraction_Group_2 = 0.05
    threshold_NumSeq_Group_2 = 2
    tableinfoFile = ""
    pdbtospFile = ""
    sprotACListFile = ""
    threshold_g12_seqidt = 20.0
    topoalnFile = ""
    aapath = ""

    i = 1
    # Set by "--": only the single argument that follows is taken as input.
    isNonOptionArg = False
    while i < numArgv:
        if isNonOptionArg:
            fileList.append(argv[i])
            isNonOptionArg = False
            i += 1
        elif argv[i] == "--":
            isNonOptionArg = True
            i += 1
        # startswith() instead of argv[i][0]: an empty-string argument must
        # not raise IndexError; it falls through to the positional branch.
        elif argv[i].startswith("-"):
            if argv[i] in ["-h", "--help"]:
                PrintHelp()
                return 1
            elif argv[i] in ["-o", "--o", "-outfile"]:
                (outfile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-l", "--l"]:
                (fileListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-seqidttype", "--seqidttype"]:
                (g_params['seqidttype'], i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-tableinfo", "--tableinfo"]:
                (tableinfoFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-topoaln", "--topoaln"]:
                (topoalnFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-aapath", "--aapath"]:
                (aapath, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-thncls2", "--thncls2"]:
                (threshold_NumSeq_Group_2,
                 i) = myfunc.my_getopt_int(argv, i)
            elif argv[i] in ["-thfrac2", "--thfrac2"]:
                (threshold_Fraction_Group_2,
                 i) = myfunc.my_getopt_float(argv, i)
            elif argv[i] in ["-pfamdef", "--pfamdef"]:
                (pfamDefFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-pdbtosp", "--pdbtosp"]:
                (pdbtospFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-sprot", "--sprot"]:
                (sprotACListFile, i) = myfunc.my_getopt_str(argv, i)
            elif argv[i] in ["-q", "--q"]:
                g_params['isQuiet'] = True
                i += 1
            else:
                sys.stderr.write("Error! Wrong argument: %s\n" % (argv[i]))
                return 1
        else:
            fileList.append(argv[i])
            i += 1

    # Inputs named in the list file are appended to the positional ones.
    if fileListFile != "":
        fileList += myfunc.ReadIDList(fileListFile)
    if len(fileList) < 1:
        sys.stderr.write("No input set. exit\n")
        return 1

    if myfunc.checkfile(topoalnFile, "topoalnFile") != 0:
        return 1
    if myfunc.checkfile(aapath, "aapath") != 0:
        return 1

    if outfile == "":
        sys.stderr.write("outfile not set. Exit\n")
        return 1

    outpath = myfunc.my_dirname(outfile)
    if not os.path.exists(outpath):
        cmd = ["mkdir", "-p", outpath]
        try:
            subprocess.check_output(cmd)
        # "as" form is valid on Python 2.6+ and Python 3.
        except subprocess.CalledProcessError as e:
            print(e)
            return 1
for tt in fracList: fpout.write(" %7.2f" % (tt * 100)) fpout.write("\n") myfunc.myclose(fpout) print "file %s output" % (outfile1) # make plot cmd = ["%s/plotMaxFracFamilyWithTopoVariation.sh" % (binpath)] + outfileList try: subprocess.check_output(cmd) except subprocess.CalledProcessError, e: print e # output selected pairlist and draw pairwise topology alignment for selected # pairs outpath = myfunc.my_dirname(outfile) ext_topoaln = ".topoaln.fa" outfile_selected_pair = outfile + ".selected.pairlistwithfamid" fpout = myfunc.myopen(outfile_selected_pair, None, "w", True) print >> fpout, "#%-6s %6s %6s %7s %15s %10s %10s %6s %4s %4s %5s %5s" % ( "seqid1", "seqid2", "seqidt", "famid", "pfamdef", "numSeqCls1", "numSeqCls2", "numSeq", "nTM1", "nTM2", "isSP", "isPDB") for li in selectedPairList: print >> fpout, "%-7s %6s %6.1f %7s %15s %10d %10d %6d %4d %4d %5d %5d" % ( li[0], li[1], li[2], li[3], li[4], li[5], li[6], li[7], li[8], li[9], li[10], li[11]) myfunc.myclose(fpout) cmd = [ "%s/selectPairaln.py" % (binpath), "-pairaln", topoalnFile, "-l", outfile_selected_pair, "-split", "-outpath", outpath, "-ext",
cntf = 0 while i < numInput: splitList = [] cntscore = 0 j = 0 while i+j < numInput: pair = inputList[i+j] j += 1 seqfilename = pair[0] numseq = pair[1] if numseq >= 2: # There should be at least two sequences to run msa rootname = os.path.basename(os.path.splitext(seqfilename)[0]) if outpath != "": cur_outpath = outpath else: cur_outpath = myfunc.my_dirname(seqfilename) gzfile = cur_outpath + os.sep + "%s.%s.mfa.gz"%(rootname, prog) msafile = cur_outpath + os.sep + "%s.%s.mfa"%(rootname, prog) if ((g_params['isGzip'] == True and not os.path.exists(gzfile)) or (g_params['isGzip'] == False and not os.path.exists(msafile))): cntscore += pair[1]**2 splitList.append(pair[0]) if cntscore > threshold: break else: print "%s already exist. Ignore"%(gzfile); if len(splitList) > 0: splitidlistfile = workdir + os.sep \ + "splitidlist.%s.%d.idlist"%(prog, cntf) splitscriptfile = workdir + os.sep \
def Build_seqid2pfamid(infile, g_outpath): #{{{
    """Convert parsed hmmscan records into per-sequence Pfam tables.

    ``infile`` is read block by block; every non-empty, non-comment line is
    parsed with ``ScanfHmmscanRecord`` and kept when its e-value (field 2)
    is at most ``g_params['evalue_threshold']``.  Two files are written to
    the output directory, named after ``infile`` without its last extension:

    * ``<rootname>.seqid2pfamid``: ``seqid N famid1 famid2 ...`` with the
      unique family ids (field 1) of the sequence;
    * ``<rootname>.domainlistperseq``: ``seqid N famid,a,b ...`` with one
      entry per kept record, built from fields 1, 5 and 6.
      NOTE(review): fields 5 and 6 look like domain boundaries -- confirm
      against ScanfHmmscanRecord.

    Args:
        infile: path of the hmmscan result file.
        g_outpath: output directory; "" means the directory of ``infile``.

    Returns:
        0 on success; 1 when both result files already exist and
        ``g_params['isOverwrite']`` is false, or when ``infile`` cannot be
        read.
    """
    if g_outpath != "":
        outpath = g_outpath
    else:
        outpath = myfunc.my_dirname(infile)
    rootname = os.path.basename(os.path.splitext(infile)[0])
    domainfile = "%s/%s.domainlistperseq" % (outpath, rootname)
    seqid2pfamidfile = "%s/%s.seqid2pfamid" % (outpath, rootname)

    if (os.path.exists(domainfile) and os.path.exists(seqid2pfamidfile)
            and not g_params['isOverwrite']):
        sys.stderr.write("result file %s and %s exist. Ignore\n" %
                         (domainfile, seqid2pfamidfile))
        return 1

    evalue_threshold = g_params['evalue_threshold']

    # Open the input first: creating the result files before knowing that
    # the input is readable would leave empty files behind, and those make
    # the next run skip this input with "exist. Ignore".
    hdl = myfunc.ReadLineByBlock(infile)
    if hdl.failure:
        return 1

    fpout_domain = None
    fpout_table = None
    try:
        fpout_domain = myfunc.myopen(domainfile, None, "w", True)
        fpout_table = myfunc.myopen(seqid2pfamidfile, None, "w", True)

        queue = deque()  # kept records that have not been written yet
        lines = hdl.readlines()
        while lines is not None:
            for line in lines:
                if not line or line[0] == "#":
                    continue
                rd = ScanfHmmscanRecord(line)
                if rd is None:
                    sys.stderr.write("%s: bad record. line=\"%s\"\n" %
                                     (infile, line))
                elif rd[2] <= evalue_threshold:
                    queue.append(rd)

            # A sequence is complete once a later sequence id has shown up,
            # so every id but the last one in the queue can be written.
            idlist_unique = myfunc.uniquelist([x[0] for x in queue])
            idlist_complete = idlist_unique[:-1]
            if idlist_complete:
                idset_complete = set(idlist_complete)
                recordDict = {}
                pending = deque()
                for rec in queue:
                    if rec[0] in idset_complete:
                        recordDict.setdefault(rec[0], []).append(rec)
                    else:
                        pending.append(rec)
                for seqid in idlist_complete:
                    _WriteSeqid2PfamidRecord(seqid, recordDict[seqid],
                                             fpout_table, fpout_domain)
                # Keep exactly the records that were not written.  Popping a
                # count from the left is only right when the records of one
                # sequence are contiguous.
                queue = pending

            lines = hdl.readlines()

        if len(queue) > 0:
            # output the last item
            _WriteSeqid2PfamidRecord(queue[0][0], queue, fpout_table,
                                     fpout_domain)
    finally:
        hdl.close()
        if fpout_domain is not None:
            myfunc.myclose(fpout_domain)
        if fpout_table is not None:
            myfunc.myclose(fpout_table)
    return 0


def _WriteSeqid2PfamidRecord(seqid, records, fpout_table, fpout_domain):
    """Write the table line and the domain-list line of one sequence."""
    famidlist = myfunc.uniquelist([x[1] for x in records])
    fpout_table.write("%s %d" % (seqid, len(famidlist)))
    for pfamid in famidlist:
        fpout_table.write(" %s" % (pfamid))
    fpout_table.write("\n")

    fpout_domain.write("%s %d" % (seqid, len(records)))
    for rd in records:
        fpout_domain.write(" %s,%d,%d" % (rd[1], rd[5], rd[6]))
    fpout_domain.write("\n")