def convSim(self, ifile, ofile, map1, logDir):
    """Write the ``self.ef1`` names for a numbered transaction file into ``logDir``.

    A single nysol pipeline reads ``ifile`` without a header (column 0 is
    renamed to ``tra``), expands each transaction into rows, joins the
    derived ``num1`` field against ``map1`` to obtain ``self.ef1``, regroups
    the rows and writes the ``self.ef1`` field to ``<logDir>/<ofile>``.

    :param ifile: headerless input file
    :param ofile: output file name, created inside ``logDir``
    :param map1: mapping joined on ``num1`` that supplies ``self.ef1``
    :param logDir: directory that receives the output file
    """
    out_path = "{}/{}".format(logDir, ofile)

    flow = None
    flow <<= nm.mcut(nfni=True, f="0:tra", i=ifile)
    # presumably strips a trailing blank from each transaction -- TODO confirm
    flow <<= nm.msed(f="tra", c=' $', v="")
    flow <<= nm.mnumber(q=True, S=1, a="num1")
    flow <<= nm.mtra(r=True, f="tra:num11")
    flow <<= nm.mnumber(q=True, S=1, a="order")
    flow <<= nm.mcal(c='${num11}+1', a="num1")
    flow <<= nm.mjoin(k="num1", m=map1, f=self.ef1)
    # NOTE(review): the grouping key is "num0", but the only numbering fields
    # created above are "num1", "num11" and "order" -- confirm the key name.
    flow <<= nm.mtra(k="num0", s="order%n,num1%n", f=self.ef1)
    flow <<= nm.mcut(f=self.ef1, o=out_path)
    flow.run()
def pair2tra(self, ei, ef1, ef2, traFile, mapFile1, mapFile2):
    """Convert an edge (pair) file into a numbered transaction file and two node maps.

    The ``ef1`` column of ``ei`` is de-duplicated and numbered into
    ``mapFile1`` (fields ``node1``, ``num1``); the ``ef2`` column likewise
    into ``mapFile2`` (fields ``node2``, ``num2``). Every edge is then
    rewritten with those numbers, the ``num2`` values are gathered per
    ``num1`` and the result is written to ``traFile`` without a header.

    The ``ei``/``ef1``/``ef2`` arguments are honoured here. Earlier the body
    ignored them and always read the same-named instance attributes, so a
    caller passing anything else was silently disregarded.

    :param ei: edge file to read
    :param ef1: name of the first node column in ``ei``
    :param ef2: name of the second node column in ``ei``
    :param traFile: output path of the headerless transaction file
    :param mapFile1: output path of the ``node1`` -> ``num1`` map
    :param mapFile2: output path of the ``node2`` -> ``num2`` map
    """
    # node1 -> num1 map (also reused below as the join master)
    map1 = None
    map1 <<= nm.mcut(f="{}:node1".format(ef1), i=ei)
    map1 <<= nm.mdelnull(f="node1")
    map1 <<= nm.muniq(k="node1")
    map1 <<= nm.mnumber(s="node1", a="num1", o=mapFile1)

    # node2 -> num2 map
    map2 = None
    map2 <<= nm.mcut(f="{}:node2".format(ef2), i=ei)
    map2 <<= nm.mdelnull(f="node2")
    map2 <<= nm.muniq(k="node2")
    map2 <<= nm.mnumber(s="node2", a="num2", o=mapFile2)

    # numbered edges -> one transaction line per num1
    tra = None
    tra <<= nm.mcut(f="{}:node1,{}:node2".format(ef1, ef2), i=ei)
    tra <<= nm.mjoin(k="node1", m=map1, f="num1")
    tra <<= nm.mjoin(k="node2", m=map2, f="num2")
    tra <<= nm.mcut(f="num1,num2")
    tra <<= nm.msortf(f="num1,num2%n")
    tra <<= nm.mtra(k="num1", s="num2%n", f="num2")
    tra <<= nm.msortf(f="num1%n")
    tra <<= nm.mcut(f="num2", nfno=True, o=traFile)
    tra.run()

    # NOTE(review): this dumps the generated file to stdout; it looks like a
    # debugging aid -- confirm whether callers rely on the output.
    os.system("cat " + traFile)
def edge2mtx(self, ei, itra, map1, map2):
    """Number both node columns of an edge file and write it as transactions.

    ``self.ef1`` values of ``ei`` are numbered from 1 into ``map1`` (field
    ``num1``) and ``self.ef2`` values into ``map2`` (field ``num2``). The
    edges are then joined against both maps, the ``num2`` values are
    gathered per ``num1`` and the headerless result is passed through
    ``tr ',' ' '`` before being written to ``itra``.

    :param ei: edge file to read
    :param itra: output path of the transaction file
    :param map1: output path of the ``self.ef1`` numbering
    :param map2: output path of the ``self.ef2`` numbering
    """
    first_map = nm.mcut(f=self.ef1, i=ei)
    first_map <<= nm.muniq(k=self.ef1)
    first_map <<= nm.mdelnull(f=self.ef1)
    first_map <<= nm.mnumber(q=True, a="num1", S=1, o=map1)

    second_map = nm.mcut(f=self.ef2, i=ei)
    second_map <<= nm.muniq(k=self.ef2)
    second_map <<= nm.mdelnull(f=self.ef2)
    second_map <<= nm.mnumber(q=True, a="num2", S=1, o=map2)

    flow = None
    flow <<= nm.mcut(f=[self.ef1, self.ef2], i=ei)
    flow <<= nm.mjoin(k=self.ef1, m=first_map, f="num1")
    flow <<= nm.mjoin(k=self.ef2, m=second_map, f="num2")
    flow <<= nm.mcut(f="num1,num2")
    flow <<= nm.mtra(k="num1", f="num2")
    flow <<= nm.msortf(f="num1%n")
    flow <<= nm.mcut(f="num2", nfno=True)
    # swap commas for blanks in the headerless stream before writing it out
    flow <<= nm.cmd("tr ',' ' '")
    flow <<= nm.mwrite(o=itra)
    flow.run()
def __init__(self, db, outtf=True):
    """Prepare per-class weight files and the numbered transaction file.

    For every class name in ``db.clsNameRecSize`` a temporary weight file is
    created and the positive weight from ``self.calOmega`` is recorded in
    ``self.posWeight``. Afterwards the items of ``db.file`` are replaced by
    the ids found in ``db.items.file`` and gathered per transaction id into
    the temporary file ``self.file`` (written without a header).

    :param db: input database object
    :param outtf: stored as ``self.outtf``
    """
    self.size = None
    self.pFile = None
    self.tFile = None
    self.temp = mtemp.Mtemp()
    self.db = db  # input database
    self.file = self.temp.file()
    self.outtf = outtf
    self.weightFile = {}
    self.posWeight = {}
    self.sigma = {}
    self.msgoff = True

    items = self.db.items

    for cName, posSize in db.clsNameRecSize.items():
        self.weightFile[cName] = self.temp.file()
        self.posWeight[cName] = self.calOmega(posSize)
        replacement = "%s:%s" % (cName, self.posWeight[cName])

        weights = nm.mcut(nfno=True, f=self.db.clsFN, i=self.db.cFile)
        weights <<= nm.mchgstr(nfn=True, f=0, O=-1,
                               o=self.weightFile[cName], c=replacement)
        weights.run()

    # Convert the items from symbols to numbers.
    id_pair = self.db.idFN + "," + items.idFN
    conv = nm.mjoin(k=self.db.itemFN, K=items.itemFN, i=self.db.file,
                    m=items.file, f=items.idFN)
    conv <<= nm.mcut(f=id_pair)
    conv <<= nm.mtra(k=self.db.idFN, f=items.idFN)
    conv <<= nm.mcut(f=items.idFN, nfno=True, o=self.file)
    conv.run()
def toNum():
    """Derive three item-set files from ``<datPath>/online_all.csv``.

    * ``onlineM_all.csv``    -- unique (InvoiceNo, StockCode) pairs
    * ``onlineT_all.csv``    -- one headerless line of numbered stock codes
      per invoice
    * ``onlineO_all.basket`` -- the previous file with blanks turned into
      commas by ``tr``

    Reads the module-level ``datPath`` and ``debug`` names.
    """
    for _size in ["all"]:
        src_csv = "%s/online_all.csv" % datPath
        take_core_out = "%s/onlineT_all.csv" % datPath  # data for Take.core
        orange_out = "%s/onlineO_all.basket" % datPath  # data for Orange
        take_out = "%s/onlineM_all.csv" % datPath  # data for Take

        pairs = None
        pairs <<= nm.mcut(f="InvoiceNo,StockCode", i=src_csv)
        pairs <<= nm.muniq(k="InvoiceNo,StockCode")
        pairs <<= nm.mfldname(q=True, o=take_out)
        pairs.run(msg=debug)

        # numbering of the distinct stock codes, used as the join master below
        code_map = None
        code_map <<= nm.mcut(f="StockCode", i=src_csv)
        code_map <<= nm.muniq(k="StockCode")
        code_map <<= nm.mnumber(s="StockCode", a="num")

        baskets = None
        baskets <<= nm.mjoin(k="StockCode", m=code_map, f="num", i=src_csv)
        baskets <<= nm.mcut(f="InvoiceNo,num:StockCode")
        baskets <<= nm.mtra(k="InvoiceNo", f="StockCode")
        baskets <<= nm.mcut(f="StockCode", nfno=True, o=take_core_out)
        baskets.run(msg=debug)

        os.system("tr ' ' ',' <%s >%s" % (take_core_out, orange_out))
def __init__(self,db):
    """Initialise the enumeration settings and build the numbered transaction file.

    All enumeration parameters start out unset. The items of ``db.file`` are
    then replaced by the ids found in ``db.items.file`` and gathered per
    transaction id into the temporary file ``self.file`` (no header line).

    :param db: input database object
    """
    # enumeration parameters, filled in later
    self.eArgs = None
    self.type = None
    self.minCnt = None
    self.minSup = None
    self.maxCnt = None
    self.maxSup = None
    self.minLen = None
    self.maxLen = None
    self.top = None
    self.skipTP = False

    # result files
    self.pFile = None
    self.tFile = None

    self.msgoff = True
    self.temp = mtemp.Mtemp()
    self.db = db  # input database
    self.file = self.temp.file()

    items = self.db.items

    # Convert the items from symbols to numbers.
    id_pair = self.db.idFN + "," + items.idFN
    conv = nm.mjoin(k=self.db.itemFN, K=items.itemFN, m=items.file,
                    f=items.idFN, i=self.db.file)
    conv <<= nm.mcut(f=id_pair)
    conv <<= nm.mtra(k=self.db.idFN, f=items.idFN)
    conv <<= nm.mcut(f=items.idFN, nfno=True, o=self.file)
    conv.run()
def calGsize(self, file):
    """Return ``(nodesize, edgesize)`` for a headerless transaction file.

    ``edgesize`` is the record count of ``file`` as reported by
    ``nu.mrecount``. ``nodesize`` is the first cell of the first row
    produced by the counting pipeline below.

    :param file: path of the headerless input file
    :return: tuple ``(nodesize, edgesize)``
    """
    edge_total = nu.mrecount(i=file, nfni=True)

    counter = None
    counter <<= nm.mcut(i=file, f="0:tra", nfni=True)
    # NOTE(review): mtra is called without r=True here, whereas the other
    # readers of "tra" files in this code expand them with r=True -- confirm
    # that the node count is computed on the intended rows.
    counter <<= nm.mtra(f="tra")
    counter <<= nm.muniq(k="tra")
    counter <<= nm.mcount(a="cnt")
    counter <<= nm.mcut(f="cnt")
    counter <<= nm.writelist(dtype="cnt:int")

    rows = counter.run()
    node_total = rows[0][0]
    return node_total, edge_total
def __init__(self, db, outtf=True):
    """Build the numbered, time-ordered transaction file for ``db``.

    The items of ``db.file`` are replaced by the ids found in
    ``db.items.file``. They are then gathered per transaction id, using the
    time field as the ordering key, into the temporary file ``self.file``
    (written without a header).

    :param db: input database object
    :param outtf: stored as ``self.outtf``
    """
    self.size = None
    self.temp = mtemp.Mtemp()
    self.db = db  # input database
    self.file = self.temp.file()
    self.outtf = outtf
    self.top = None
    self.msgoff = True

    items = self.db.items

    # Convert the items from symbols to numbers.
    kept_fields = self.db.idFN + "," + self.db.timeFN + "," + items.idFN
    time_order = self.db.timeFN + "%n"

    conv = nm.mjoin(k=self.db.itemFN, K=items.itemFN, m=items.file,
                    f=items.idFN, i=self.db.file)
    conv <<= nm.mcut(f=kept_fields)
    conv <<= nm.mtra(k=self.db.idFN, s=time_order, f=items.idFN)
    conv <<= nm.mcut(f=items.idFN, nfno=True, o=self.file)
    conv.run()
def __init__(self,iFile,itemFN,taxoFN):
    """Load a taxonomy file and count its distinct items and categories.

    The unique (item, taxonomy) pairs of ``iFile`` are written to the
    temporary file ``self.file``. ``self.itemSize`` and ``self.taxoSize``
    are set from the first cell of the two counting pipelines below.

    :param iFile: path of the taxonomy data file
    :param itemFN: name of the item field
    :param taxoFN: name of the taxonomy (category) field
    """
    self.itemFN = itemFN  # item field name
    self.taxoFN = taxoFN  # taxonomy field name
    self.itemSize = None  # number of distinct items
    self.taxoSize = None  # number of distinct categories

    self.temp = mtemp.Mtemp()
    self.iFile = iFile
    self.iPath = os.path.abspath(self.iFile)

    # Write the unique (item, taxonomy) pairs to the working file.
    self.file = self.temp.file()
    pair_fields = self.itemFN + "," + self.taxoFN
    nm.mcut(f=pair_fields, i=self.iFile).muniq(k=pair_fields, o=self.file).run(msg="on")

    item_counter = nm.mcut(f=self.itemFN, i=self.iFile)
    item_counter <<= nm.mtrafld(f=self.itemFN, a="__fld", valOnly=True)
    item_counter <<= nm.mtra(f="__fld", r=True)
    item_counter <<= nm.muniq(k="__fld")
    item_counter <<= nm.mcount(a="size")
    item_counter <<= nm.mcut(f="size")
    item_rows = item_counter.run()
    self.itemSize = int(item_rows[0][0])

    taxo_rows = (
        nm.mcut(f=self.taxoFN + ":item", i=self.file)
        .muniq(k="item")
        .mcount(a="size")
        .mcut(f="size")
        .run()
    )
    self.taxoSize = int(taxo_rows[0][0])
def convRsl(self, ifile, ofile, map1, map2, logDir=None):
    """Write a numbered transaction file back out as an edge list of node names.

    ``ifile`` is read without a header (column 0 becomes ``tra``), each line
    is numbered as ``num1`` and expanded into one row per ``num2`` value.
    ``map2`` and ``map1`` supply the ``self.ef2`` / ``self.ef1`` names for
    those numbers, and the two name columns are written out.

    Fix: the output path used to be built with ``"{}/#{ofile}".format(...)``.
    ``{ofile}`` is a keyword field that was never supplied, so every call
    with ``logDir`` set raised ``KeyError``. The path is now
    ``<logDir>/<ofile>``.

    :param ifile: headerless input file
    :param ofile: output file name (or full path when ``logDir`` is falsy)
    :param map1: mapping joined on ``num1`` that supplies ``self.ef1``
    :param map2: mapping joined on ``num2`` that supplies ``self.ef2``
    :param logDir: optional directory that ``ofile`` is placed in
    """
    out_path = "{}/{}".format(logDir, ofile) if logDir else ofile

    f = None
    f <<= nm.mcut(nfni=True, f="0:tra", i=ifile)
    # presumably strips a trailing blank from each transaction -- TODO confirm
    f <<= nm.msed(f="tra", c=' $', v="")
    f <<= nm.mnumber(q=True, S=1, a="num1")
    f <<= nm.mtra(r=True, f="tra:num2")
    f <<= nm.mjoin(k="num2", m=map2, f=self.ef2)
    f <<= nm.mjoin(k="num1", m=map1, f=self.ef1)
    f <<= nm.msortf(f="num1%n,num2%n")
    f <<= nm.mcut(f=[self.ef1, self.ef2])
    f <<= nm.mfldname(q=True, o=out_path)
    f.run()
def __init__(self,db,outtf=True):
    """Prepare per-class weight files and the numbered transaction file.

    Weight files
        One temporary weight file is produced per class name found in
        ``db.clsNameRecSize``. For each of them the positive weight is the
        value returned by ``self.calOmega`` and is recorded in
        ``self.posWeight``; the file is produced by ``mchgstr`` from the
        class column of ``db.cFile`` with ``O=-1`` as the fallback value.

    Transaction file
        The items of ``db.file`` are replaced by the ids found in
        ``db.items.file`` and gathered per transaction id into the temporary
        file ``self.file`` (written without a header).

    :param db: input database object
    :param outtf: stored as ``self.outtf``
    """
    self.size = None
    self.msgoff = True
    self.temp = nu.Mtemp()
    self.db = db  # input database
    self.file = self.temp.file()
    self.outtf = outtf
    self.weightFile = {}
    self.posWeight = {}
    self.sigma = {}

    items = self.db.items

    # Build one weight file per class.
    for cName,posSize in db.clsNameRecSize.items():
        self.weightFile[cName] = self.temp.file()
        self.posWeight[cName] = self.calOmega(posSize)
        replacement = "%s:%s"%(cName,self.posWeight[cName])
        nm.mcut(nfno=True,f=self.db.clsFN,i=self.db.cFile).mchgstr(nfn=True,f=0,O=-1,o=self.weightFile[cName],c=replacement).run()

    # Convert the items from symbols to numbers.
    kept_fields = self.db.idFN+","+self.db.timeFN+","+items.idFN
    sort_key = self.db.idFN+","+self.db.timeFN+"%n"

    conv = None
    conv <<= nm.mjoin(k=self.db.itemFN,K=items.itemFN,m=items.file,f=items.idFN,i=self.db.file)
    conv <<= nm.mcut(f=kept_fields)
    conv <<= nm.msortf(f=sort_key)
    # NOTE(review): mtra gets no s= ordering key here; confirm that the order
    # produced by the preceding msortf survives the k= grouping.
    conv <<= nm.mtra(k=self.db.idFN,f=items.idFN)
    conv <<= nm.mcut(f=items.idFN,nfno=True,o=self.file)
    conv.run()
def run(self):
    """Enumerate patterns with LCM over the edge file and write them to ``self.oFile``.

    Steps:

    1. ``pair2tra`` turns the edge file into a numbered transaction file
       plus two node maps.
    2. ``extTake.lcm`` runs with type ``CIf`` and support 1, optionally
       bounded by ``self.minSize2`` / ``self.maxSize2``.
    3. The patterns are mapped back to ``node2`` names. Depending on
       ``self.byedge`` the result is written either as one row per
       (pattern id, node1) or as one row per pattern with the ``node1``
       values gathered into a vector. Rows are filtered on ``size1`` within
       ``[self.minSize1, self.maxSize1]``.
    """
    work = mtemp.Mtemp()
    # Every temp name is reserved up front, in this order, even where a
    # branch below does not use it.
    tra_path = work.file()
    map1_path = work.file()
    map2_path = work.file()
    lcm_out = work.file()
    pat_raw = work.file()
    _unused = work.file()
    tid_edge = work.file()
    tid_group = work.file()

    self.pair2tra(self.ei, self.ef1, self.ef2, tra_path, map1_path, map2_path)

    lcm_args = {"type": "CIf", "sup": 1, "o": lcm_out, "i": tra_path}
    if self.minSize2:
        lcm_args["l"] = self.minSize2
    if self.maxSize2:
        lcm_args["u"] = self.maxSize2
    extTake.lcm(lcm_args)
    extTake.lcmtrans(lcm_out, "p", pat_raw)

    size1_range = "[{},{}]".format(self.minSize1, self.maxSize1)

    # patterns with node2 names, one row per pattern id
    patterns = None
    patterns <<= nm.mdelnull(f="pattern", i=pat_raw)
    patterns <<= nm.mvreplace(vf="pattern", m=map2_path, K="num2", f="node2")
    patterns <<= nm.mcut(f="pid,pattern,size:size2")
    patterns <<= nm.mvsort(vf="pattern")
    patterns <<= nm.msortf(f="pid")

    if not self.byedge:
        extTake.lcmtrans(lcm_out, "t", tid_group)

        grouped = None
        grouped <<= nm.mjoin(k="__tid", m=map1_path, i=tid_group, f="node1", K="num1")
        grouped <<= nm.mtra(k="pid", f="node1")
        grouped <<= nm.mvcount(vf="node1:size1")
        grouped <<= nm.mjoin(k="pid", m=patterns, f="pattern,size2")
        grouped <<= nm.mselnum(f="size1", c=size1_range)
        grouped <<= nm.mvsort(vf="node1,pattern")
        grouped <<= nm.msortf(f="node1,pattern")
        grouped <<= nm.mcut(
            f="node1:{},pattern:{},size1,size2".format(self.ef1, self.ef2),
            o=self.oFile)
        grouped.run()
    else:
        # one row per pattern element
        pattern_rows = nm.mtra(f="pattern", i=patterns, r=True)
        extTake.lcmtrans(lcm_out, "t", tid_edge)

        members = None
        members <<= nm.mjoin(k="__tid", m=map1_path, f="node1", K="num1", i=tid_edge)
        members <<= nm.msortf(f="pid")

        sizes = None
        sizes <<= nm.mcount(k="pid", a="size1", i=members)
        sizes <<= nm.mselnum(f="size1", c=size1_range)

        edges = None
        edges <<= nm.mjoin(k="pid", m=sizes, f="size1", i=members)
        edges <<= nm.mnjoin(k="pid", m=pattern_rows, f="pattern,size2")
        edges <<= nm.mcut(
            f="pid:id,node1:{},pattern:{},size1,size2".format(self.ef1, self.ef2),
            o=self.oFile)
        edges.run()
def run(self, **kw_args):
    """Build a METIS graph file from the edge/node data and partition it with ``gpmetis``.

    The edge file is made symmetric and de-duplicated, node names are mapped
    to 1-based numbers, the METIS graph body (optionally with edge and node
    weights) is assembled, and ``gpmetis`` is run through the shell unless
    ``self.noexe`` is set. The resulting cluster numbers are joined back to
    the node names and written to ``self.oFile``.

    :param kw_args: ``msg="on"`` sets ``KG_ScpVerboseLevel`` to ``"3"``
    :raises Exception: when a node file is given and an edge refers to a node
        missing from it, or when ``gpmetis`` leaves no ``.part.*`` output
    """
    os.environ["KG_VerboseLevel"] = "2"
    if "msg" in kw_args:
        if kw_args["msg"] == "on":
            os.environ['KG_ScpVerboseLevel'] = "3"

    temp = Mtemp()
    xxedge = temp.file()
    xxnode = temp.file()
    xxnam2num = temp.file()
    xxnum2nam = temp.file()
    xxebase = temp.file()
    xxbody = temp.file()

    # Clean the edge data: unique edges, then add the reversed direction so
    # that every edge is present both ways.
    e1 = None
    if self.ew:
        e1 <<= nm.mcut(f="%s:__node1,%s:__node2,%s:__weight" % (self.ef1, self.ef2, self.ew), i=self.eFile)
    else:
        e1 <<= nm.mcut(f="%s:__node1,%s:__node2" % (self.ef1, self.ef2), i=self.eFile)
    e1 <<= nm.muniq(k="__node1,__node2")
    e2 = nm.mfldname(i=e1, f="__node2:__node1,__node1:__node2")
    fe = None
    fe <<= nm.muniq(k="__node1,__node2", i=[e1, e2], o=xxedge)
    fe.run()

    # cleaning the node data (remove duplicate nodes)
    fn = None
    if self.nFile:
        if self.nw:
            fn <<= nm.mcut(f="%s:__node,%s" % (self.nf, self.nw), i=self.nFile)
        else:
            fn <<= nm.mcut(f="%s:__node" % (self.nf), i=self.nFile)
        fn <<= nm.muniq(k="__node", o=xxnode)
    else:
        # no node file: take the nodes from both ends of the edges
        xxen1 = nm.mcut(f="__node1:__node", i=xxedge)
        xxen2 = nm.mcut(f="__node2:__node", i=xxedge)
        fn <<= nm.muniq(k="__node", o=xxnode, i=[xxen1, xxen2])
    fn.run()

    # Build the node name <=> node number conversion tables.
    fmap = None
    fmap <<= nm.mcut(f="__node", i=xxnode)
    fmap <<= nm.mnumber(a="__num", S=1, q=True, o=xxnam2num)
    fmap <<= nm.msortf(f="__num", o=xxnum2nam)
    fmap.run()

    # When a node file is given, check it is consistent with the edge file.
    if self.nFile:
        ncheck = nm.mcut(f="__node1:__node", i=xxedge)
        ncheck <<= nm.mcommon(k="__node", m=xxnam2num, r=True)
        nmatch = ncheck.run()
        if len(nmatch) > 0:
            raise Exception(
                "#ERROR# the node named '%s' in the edge file doesn't exist in the node file." % (nmatch[0][0]))

    # METIS graph file format
    # first line: n m [fmt] [ncon]
    #   n: number of nodes, m: number of edges, ncon: number of node weights
    #   1xx: node size present (not used, meaning always "0")
    #   x1x: node weights present
    #   xx1: edge weights present
    # following lines: s w_1 w_2 ... w_ncon v_1 e_1 v_2 e_2 ... v_k e_k
    #   s: node size (node size cannot be used)
    #   w_x: node weight
    #   v_x: number (line number) of a connected node
    #   e_x: edge weight
    # --------------------
    # generate edge data using the integer numbered nodes
    #fnnum = None
    fnnum = nm.mcut(f="__num:__node_n1", i=xxnam2num)  # node numbers only

    fenum = None
    fenum <<= nm.mjoin(k="__node1", K="__node", f="__num:__node_n1", m=xxnam2num, i=xxedge)
    fenum <<= nm.mjoin(k="__node2", K="__node", f="__num:__node_n2", m=xxnam2num)
    fenum <<= nm.msortf(f="__node_n1")  # numbered edges

    febase = None
    febase <<= nm.mnjoin(k="__node_n1", m=fenum, i=fnnum, n=True)
    febase <<= nm.msortf(f="__node_n1%n,__node_n2%n", o=xxebase)  # written to xxebase
    febase.run()

    fbody = None
    if not self.ew:
        fbody <<= nm.mcut(f="__node_n1,__node_n2", i=xxebase)
        fbody <<= nm.mtra(k="__node_n1", f="__node_n2", q=True)
        fbody <<= nm.mcut(f="__node_n2", nfno=True, o=xxbody)
    # if ew= is specified, merge the weight data into the edge data.
    else:
        # Neighbour numbers get even __seq values (0, 2, ...) and weights odd
        # ones (1, 3, ...), so sorting on __seq interleaves "v e v e ...".
        febody = None
        febody <<= nm.mcut(f="__node_n1,__node_n2:__v", i=xxebase)
        febody <<= nm.mnumber(S=0, I=2, a="__seq", q=True)
        fwbody = None
        fwbody <<= nm.mcut(f="__node_n1,__weight:__v", i=xxebase)
        fwbody <<= nm.mnumber(S=1, I=2, a="__seq", q=True)
        fbody <<= nm.msortf(f="__seq%n", i=[febody, fwbody])
        fbody <<= nm.mtra(k="__node_n1", f="__v", q=True)
        fbody <<= nm.mcut(f="__v", nfno=True, o=xxbody)
    fbody.run()
    # xxbody
    # 2 7 3 8 5 9
    # 1 7 3 10 5 11 7 12
    # 1 8 2 10 4 13 7 14
    # --------------------
    # generate node data using integer number
    if self.nFile and self.nw:
        # xxnode
        # __node,v1,v2
        # a,1,1
        # b,1,1
        # c,1,1
        xxnbody = temp.file()
        xxnbody1 = temp.file()
        fnbody = None
        fnbody <<= nm.mjoin(k="__node", f="__num", i=xxnode, m=xxnam2num)
        fnbody <<= nm.msortf(f="__num%n")
        fnbody <<= nm.mcut(f=self.nw, nfno=True)
        fnbody <<= nm.cmd("tr ',' ' ' ")  # tricky!!
        fnbody <<= nm.mwrite(o=xxnbody)
        fnbody.run()
        # xxnbody
        # 1 1
        # 1 1
        # 1 1
        # paste the node weight with edge body
        fnbody1 = None
        fnbody1 <<= nm.mpaste(nfn=True, m=xxbody, i=xxnbody)
        fnbody1 <<= nm.cmd("tr ',' ' ' ")
        fnbody1 <<= nm.mwrite(o=xxnbody1)
        fnbody1.run()
        os.system("mv %s %s" % (xxnbody1, xxbody))
        # xxbody
        # 1 1 2 7 3 8 5 9
        # 1 1 1 7 3 10 5 11 7 12
        # 1 1 1 8 2 10 4 13 7 14

    # xxedge holds every edge in both directions, hence the halving
    eSize = mrecount(i=xxedge)
    eSize /= 2
    nSize = mrecount(i=xxnode)
    nwFlag = 1 if self.nw else 0
    ewFlag = 1 if self.ew else 0
    fmt = "0%d%d" % (nwFlag, ewFlag)

    # header line + body = METIS graph file
    xxhead = temp.file()
    xxgraph = temp.file()
    os.system("echo '%d %d %s %d' > %s" % (nSize, eSize, fmt, self.ncon, xxhead))
    os.system("cat %s %s > %s" % (xxhead, xxbody, xxgraph))

    # optional copies of the number/name map and of the graph file
    if self.mFile:
        nm.mfldname(f="__num:num,__node:node", i=xxnum2nam, o=self.mFile).run()
    if self.dFile:
        os.system("cp %s %s" % (xxgraph, self.dFile))

    if not self.noexe:
        if self.verbose:
            os.system(
                "gpmetis -seed=%d -ptype=%s -ncuts=%d -ufactor=%d %s %d" %
                (self.seed, self.ptype, self.ncuts, self.ufactor, xxgraph, self.kway))
        else:
            os.system(
                "gpmetis -seed=%d -ptype=%s -ncuts=%d -ufactor=%d %s %d > /dev/null" %
                (self.seed, self.ptype, self.ncuts, self.ufactor, xxgraph, self.kway))
        import glob
        if len(glob.glob(xxgraph + ".part.*")) == 0:
            raise Exception(
                "#ERROR# command `gpmetis' didn't output any results")

        # Turn the node numbers back into the original names.
        # <xxgraph>.part.<kway>
        # 1
        # 0
        # 1
        fo = None
        fo <<= nm.mcut(f="0:cluster", nfni=True, i=xxgraph + ".part." + str(self.kway))
        fo <<= nm.mnumber(S=1, a="__num", q=True)
        fo <<= nm.mjoin(k="__num", f="__node", m=xxnum2nam)
        fo <<= nm.msortf(f="__node,cluster")
        if self.nf:
            fo <<= nm.mcut(f="__node:%s,cluster" % (self.nf), o=self.oFile)
        else:
            fo <<= nm.mcut(f="__node:node,cluster", o=self.oFile)
        fo.run()

    nu.mmsg.endLog(self.__cmdline())
def run(self):
    """Enumerate cliques with mace and write them as (id, node, size) rows.

    ``g2pair`` prepares the numbered input and the number/name map,
    ``extTake.mace`` enumerates cliques (type ``Ce`` when ``self.all`` is
    set, otherwise ``Me``; a trailing ``_`` is added when ``self.msgoff`` is
    set), and the result is expanded to one row per clique member. When
    ``self.ni`` is set, nodes from the map that appear in no selected clique
    are appended as size-1 entries. Node numbers are finally replaced by
    names and the rows are written to ``self.oFile``. If ``self.logFile`` is
    set, the arguments and the elapsed time are written there as CSV.
    """
    from datetime import datetime
    started = datetime.now()

    work = nu.Mtemp()
    pair_path = work.file()
    map_path = work.file()
    self.g2pair(self.ni, self.nf, self.ei, self.ef1, self.ef2, pair_path, map_path)

    mace_out = work.file()  # mace output (transaction format)
    mace_type = "Ce" if self.all else "Me"
    if self.msgoff:
        mace_type += "_"
    mace_args = {"type": mace_type, "i": pair_path, "o": mace_out}
    if self.minSize:
        mace_args["l"] = self.minSize
    if self.maxSize:
        mace_args["u"] = self.maxSize
    extTake.mace(mace_args)

    # number of cliques found; ids of the isolated nodes continue from here
    clique_total = nu.mrecount(i=mace_out, nfni=True)

    # convert the transaction output into (id, num, size) pair form
    members = None
    members <<= nm.mcut(i=mace_out, nfni=True, f="0:num")
    members <<= nm.mnumber(q=True, a="id")
    members <<= nm.mvcount(vf="num:size")
    members <<= nm.mtra(r=True, f="num")

    if not self.ni:
        result = members
    else:
        # when ni= specified, it add the isolated single cliques.
        covered = nm.mread(i=members)
        if self.all:
            covered <<= nm.mselstr(f="size", v=1)
        covered <<= nm.mcut(f="num")
        covered <<= nm.muniq(k="num")

        # select all nodes which are not included in any cliques
        isolated = None
        isolated <<= nm.mcut(f="num", i=map_path)
        isolated <<= nm.mcommon(k="num", m=covered, r=True)
        isolated <<= nm.mnumber(S=clique_total, a="id", q=True)
        isolated <<= nm.msetstr(v=1, a="size")
        isolated <<= nm.mcut(f="id,num,size")

        # Results go wrong without this mcut (original note: fix later).
        result = nm.mcut(i=[members, isolated], f="id,num,size")

    result <<= nm.mjoin(m=map_path, k="num", f="node")
    result <<= nm.mcut(f="id,node,size")
    result <<= nm.msortf(f="id,node", o=self.oFile)
    result.run()

    elapsed = datetime.now() - started

    # write the log file
    if self.logFile:
        log_rows = [["key", "value"]]
        for key, value in self.args.items():
            log_rows.append([key, str(value)])
        log_rows.append(["time", str(elapsed)])
        nm.writecsv(i=log_rows, o=self.logFile).run()
def enumerate(self,eArgs):
    """
    Enumerate frequent itemsets under the conditions given in eArgs.

    The pattern table is written to ``self.pFile`` and, unless ``skipTP`` is
    set, the transaction-id/pattern-id table to ``self.tFile``.

    :type eArgs: dict
    :type eArgs['type']: str
    :type eArgs['minCnt']: int
    :type eArgs['minSup']: float
    :type eArgs['maxCnt']: int
    :type eArgs['maxSup']: float
    :type eArgs['minLen']: int
    :type eArgs['maxLen']: int
    :type eArgs['top']: int
    :type eArgs['skipTP']: bool (default: False)
    :param eArgs: enumeration parameters
    :param eArgs['type']: kind of itemset to extract ('F': frequent, 'C': closed, 'M': maximal)
    :param eArgs['minCnt']: minimum support (count)
    :param eArgs['minSup']: minimum support (probability)
    :param eArgs['maxCnt']: maximum support (count)
    :param eArgs['maxSup']: maximum support (probability)
    :param eArgs['minLen']: minimum number of items in an itemset
    :param eArgs['maxLen']: maximum number of items in an itemset
    :param eArgs['top']: number of top-support patterns to enumerate
    :param eArgs['skipTP']: do not output the patterns (itemsets) matching each transaction
    """

    tf=mtemp.Mtemp()
    self.eArgs=eArgs
    self.type = eArgs["type"]

    # Minimum support and minimum support count: minCnt wins when given,
    # otherwise minSup is required (a missing key raises KeyError).
    if "minCnt" in eArgs and eArgs["minCnt"] != None:
        self.minCnt = int(eArgs["minCnt"])
        self.minSup = float(self.minCnt) / float(self.db.traSize)
    else:
        self.minSup = float(eArgs["minSup"])
        self.minCnt = int(self.minSup * float(self.db.traSize) + 0.99)

    # Maximum support and maximum support count
    self.maxCnt=None
    if ("maxCnt" in eArgs and eArgs["maxCnt"]!= None) or ( "maxSup" in eArgs and eArgs["maxSup"]!= None):
        if "maxCnt" in eArgs and eArgs["maxCnt"]!= None:
            self.maxCnt = int(eArgs["maxCnt"])
            self.maxSup = float(self.maxCnt) / float(self.db.traSize)
        else:
            self.maxSup = float(eArgs["maxSup"])
            self.maxCnt = int(self.maxSup * float(self.db.traSize) + 0.99)

    # lcm parameters; the trailing "_" is added to the type when msgoff is set
    params = {}
    if self.msgoff:
        params["type"] ="%sIf_"%(self.type)
    else:
        params["type"] ="%sIf"%(self.type)

    if self.maxCnt :
        params["U"] = str(self.maxCnt)
    if "minLen" in eArgs and eArgs["minLen"] != None :
        params["l"] = str(eArgs['minLen'])
    if "maxLen" in eArgs and eArgs["maxLen"] != None :
        params["u"] = str(eArgs['maxLen'])

    # If an upper limit on the number of patterns is given, run lcm once
    # to obtain the minimum support.
    # NOTE(review): self.top is only assigned when "top" is present, so a
    # value set by an earlier call stays in effect -- confirm this is intended.
    if "top" in eArgs and eArgs["top"] != None :
        self.top = eArgs["top"]
    if self.top and self.top>0 :
        xxtop = tf.file()
        import copy
        top_params = copy.deepcopy(params)
        top_params["i"] = self.file
        top_params["sup"] = "1"
        top_params["K"] = str(self.top)
        top_params["so"] = xxtop
        import re
        # this preliminary run uses the type without the trailing "_"
        top_params["type"] = re.sub('_$', '', top_params["type"] )
        extTake.lcm(top_params)
        with open(xxtop, "r") as rfile:
            self.minCnt = int(rfile.read().strip())
        if self.minCnt<0 :
            self.minCnt=1

    self.skipTP=False
    if "skipTP" in eArgs:
        self.skipTP=eArgs["skipTP"]

    # lcm output file
    lcmout = tf.file()
    # lcm creates no output file when there is no frequent pattern,
    # so create an empty file beforehand for that case.
    with open(lcmout, "w") as efile:
        pass

    # run lcm
    params["i"] = self.file
    params["sup"] = str(self.minCnt)
    params["o"] = lcmout
    extTake.lcm(params)

    # calculate single-item itemsets for the lift value
    # NOTE(review): this call passes keyword arguments while the other
    # extTake.lcm calls in this method pass a dict -- confirm both are accepted.
    xxone= tf.file()
    tpstr = "FIf_" if self.msgoff else "FIf"
    extTake.lcm(type=tpstr,i=self.file,sup=1,o=xxone,l=1,u=1)

    # Compute the support of each pattern and write it out as CSV.
    xxp0 = tf.file()
    self.pFile = self.temp.file()
    items=self.db.items
    trans0 = self.temp.file()
    extTake.lcmtrans(lcmout,"p",trans0)

    f =   nm.mdelnull(i=trans0,f="pattern")
    f <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
    f <<= nm.msetstr(v=self.db.traSize,a="total")
    f <<= nm.mcal(c='${count}/${total}',a="support")
    f <<= nm.mcut(f="pid,pattern,size,count,total,support")
    f <<= nm.mvsort(vf="pattern")
    f <<= nm.msortf(f="pid",o=xxp0)
    f.run()

    # xxp0
    # pid,count,total,support,pattern
    # 0,13,13,1,A
    # 4,6,13,0.4615384615,A B
    xxp1=tf.file()

    # no taxonomy given (added 2010/11/20)
    if items.taxonomy==None:
        shutil.move(xxp0,xxp1)
    # taxonomy given
    else:
        # Reduce the patterns through reduceTaxo and keep only the rows of
        # xxp0 whose pattern remains in the reduced set.
        zdd=VSOP.constant(0)
        fobj = nm.mcut(i=xxp0,f='pattern')
        for fldVal in fobj:
            zdd=zdd+VSOP.itemset(fldVal[0])
        zdd=self.reduceTaxo(zdd,self.db.items)

        xxz1=tf.file()
        xxz2=tf.file()
        zdd.csvout(xxz1)

        f0=None
        f0 <<= nm.mcut(nfni=True,f="1:pattern",i=xxz1)
        f0 <<= nm.mvsort(vf="pattern")
        f0 <<= nm.msortf(f="pattern")

        f=None
        f <<= nm.msortf(f="pattern",i=xxp0)
        f <<= nm.mcommon(k="pattern",m=f0)
        f <<= nm.msortf(f="pid",o=xxp1)
        f.run()

    # Compute the lift value.
    transl = tf.file()
    extTake.lcmtrans(xxone,"p",transl)

    xxp2 =   nm.mdelnull(i=transl,f="pattern")
    xxp2 <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
    xxp2 <<= nm.msortf(f="pattern")

    # per pattern id: sum of ln(count) over the single items of the pattern
    xxp3 =   nm.mcut(f="pid,pattern",i=xxp1)
    xxp3 <<= nm.mtra(f="pattern",r=True)
    xxp3 <<= nm.mjoin(k="pattern",m=xxp2,f="count:c1")
    xxp3 <<= nm.mcal(c='ln(${c1})',a="c1ln")
    xxp3 <<= nm.msum(k="pid",f="c1ln")

    # p3
    # pid,pattern,c1,c1ln
    # 0,A,13,2.564949357
    # 1,E,7,1.945910149
    # (original note: "does this go wrong? => OK")
    f3 =   nm.mjoin(k="pid",f="c1ln",i=xxp1,m=xxp3)
    f3 <<= nm.mcal(c='round(exp(ln(${count})-${c1ln}+(${size}-1)*ln(${total})),0.0001)',a="lift")
    f3 <<= nm.mcut(f="pid,size,count,total,support,lift,pattern")
    f3 <<= nm.msortf(f="support%nr",o=self.pFile)
    f3.run()

    #self.size = mrecount.mrecount(i=self.file)

    if not self.skipTP:
        # Write out the patterns that occur in each transaction.
        self.tFile = self.temp.file()
        xxw3i = tf.file()
        extTake.lcmtrans(lcmout,"t",xxw3i)

        # transaction ids numbered from 0 as __tid
        xxw1 = nm.mcut(f=self.db.idFN,i=self.db.file).muniq(k=self.db.idFN).mnumber(S=0,a="__tid",q=True).msortf(f="__tid")
        # pattern ids that made it into the pattern file
        xxw2 = nm.mcut(f="pid",i=self.pFile)
        xxw3 = nm.mcommon(k="pid",i=xxw3i,m=xxw2).mjoin(k="__tid",m=xxw1,f=self.db.idFN).mcut(f=self.db.idFN+",pid",o=self.tFile)
        xxw3.run()