def g2pair(self, ni, nf, ei, ef1, ef2, ew, numFile, mapFile, weightFile):
    """Convert an edge file (plus optional node file) into numbered node pairs.

    Three files are written:

    * ``mapFile``    - node name to sequence number (``num``) mapping.
    * ``numFile``    - the edges as space-separated ``num1 num2`` pairs,
                       written without a header line.
    * ``weightFile`` - the edge weight column (``ew``), or a constant
                       ``weight`` column of 1 when ``ew`` is not given.

    :param ni: node file (read only when ``nf`` is given)
    :param nf: node field name in ``ni``
    :param ei: edge file
    :param ef1: first node field name in ``ei``
    :param ef2: second node field name in ``ei``
    :param ew: weight field name in ``ei`` (may be empty/None)
    :return: the value of ``mrecount(i=mapFile)``
    """
    # Nodes taken from the edge file carry flag=0, nodes taken from the
    # node file carry flag=1.
    nodeSources = [
        nm.mcut(f="%s:node" % ef1, i=ei).msetstr(a="flag", v=0),
        nm.mcut(f="%s:node" % ef2, i=ei).msetstr(a="flag", v=0),
    ]
    if nf:
        nodeSources.append(nm.mcut(f="%s:node" % nf, i=ni).msetstr(a="flag", v=1))

    # isolated nodes are set to the end of position in mapping file.
    # S= must start from 0 (but inside R vertex number will be added one)
    mapFlow = nm.mbest(i=nodeSources, k="node", s="flag", fr=0, size=1)
    mapFlow <<= nm.mnumber(s="flag,node", a="num", S=0, o=mapFile)
    mapFlow.run()

    # Replace both edge endpoints by their numbers and dump them as
    # "num1 num2" lines.
    pairFlow = None
    pairFlow <<= nm.mcut(f=[ef1, ef2], i=ei)
    pairFlow <<= nm.mjoin(k=ef1, K="node", m=mapFile, f="num:num1")
    pairFlow <<= nm.mjoin(k=ef2, K="node", m=mapFile, f="num:num2")
    pairFlow <<= nm.mcut(f="num1,num2")
    pairFlow <<= nm.mfsort(f="num1,num2")
    pairFlow <<= nm.msortf(f="num1%n,num2%n", nfno=True)
    pairFlow <<= nm.cmd("tr ',' ' ' ")
    pairFlow <<= nm.mwrite(o=numFile)
    pairFlow.run()

    nodeSize = mrecount(i=mapFile)

    if ew:
        nm.mcut(f=ew, i=ei, o=weightFile).run()
    else:
        # No weight field given: emit a constant weight of 1 per edge.
        weightFld = "weight"
        nm.msetstr(v=1, a=weightFld, i=ei).mcut(f=weightFld, o=weightFile).run()

    return nodeSize
def runmain(self, edgeFile):
    """Process one ``*.edge`` file: number the graph, run R, write a CSV.

    The node file is expected next to the edge file, with ``.edge``
    replaced by ``.node``.  The result is written to
    ``<self.oPath>/<name>.csv`` where ``<name>`` is the edge file name
    without directory and without the ``.edge`` suffix.

    :param edgeFile: path of the edge file to process
    """
    import re

    # Raw strings: '\.' and '\/' are invalid escape sequences in a normal
    # string literal and trigger a warning on current Python versions.
    baseName = re.sub(r'\.edge$', "", edgeFile)
    name = re.sub(r'^.*/', "", baseName)
    nodeFile = re.sub(r'\.edge$', ".node", edgeFile)

    # convert the original graph to one igraph can handle
    temp = Mtemp()
    xxnum = temp.file()
    xxmap = temp.file()
    xxout = temp.file()
    xxscp = temp.file()
    nodeSize = self.g2pair(nodeFile, self.nf, edgeFile, self.ef1, self.ef2,
                           xxnum, xxmap)

    # generate R script, and run
    self.genRscript(self.directed, self.pars, xxnum, nodeSize, xxout, xxscp)
    if self.verbose:
        os.system("R --vanilla -q < %s" % (xxscp))
    else:
        os.system("R --vanilla -q --slave < %s 2>/dev/null " % (xxscp))

    # Tag every output row with the graph name and write the final CSV.
    flow = nm.msetstr(v=name, a="id", i=xxout)
    flow = flow.mcut(x=True, f="0L,0-1L", o=self.oPath + "/" + name + ".csv")
    flow.run()
def __mkEdge(key, ef1, ef2, el, ec, ed, ev, ei, norm, mapFile, oFile):
    """Build the normalized edge table used for DOT generation.

    Reads the edge file ``ei`` and writes ``oFile`` with the columns
    ``key,nam1,nam2,keyNum,num1,num2,el,ev,ed,ec,leaf1,leaf2``.

    :param key: cluster key field name
    :param ef1: first node field name
    :param ef2: second node field name
    :param el: list of label field names (joined with "_"), or None
    :param ec: edge color field name, or None
    :param ed: edge direction field name, or None
    :param ev: edge value (width) field name, or None
    :param ei: edge file
    :param norm: when true, ``ev`` is range-normalized and rescaled
    :param mapFile: name -> num/leaf mapping file
    :param oFile: output file
    """
    # The edge magnification ratio is fixed.
    er = 10

    # "src:dst" rename specs for the optional value / color / direction columns.
    optionalCols = []
    if ev:
        optionalCols.append(ev + ":ev")
    if ec:
        optionalCols.append(ec + ":ec")
    if ed:
        optionalCols.append(ed + ":ed")

    flow = None
    if el:
        # Label: concatenate every label field with "_" via mcal cat().
        labelRefs = ["$s{" + fld + "}" for fld in el]
        flow <<= nm.mcal(c='cat("_",%s)' % (','.join(labelRefs)), a="##label", i=ei)
    else:
        flow <<= nm.msetstr(v="", a="##label", i=ei)

    cutSpec = "%s:key,%s:nam1,%s:nam2,##label:el" % (key, ef1, ef2)
    if len(optionalCols) != 0:
        cutSpec = cutSpec + "," + ','.join(optionalCols)
    flow <<= nm.mcut(f=cutSpec)

    # Columns that were not supplied are added as empty strings.
    if not ev:
        flow <<= nm.msetstr(v="", a="ev")
    if not ed:
        flow <<= nm.msetstr(v="", a="ed")
    if not ec:
        flow <<= nm.msetstr(v="", a="ec")

    flow <<= nm.mnullto(f="key", v="##NULL##")
    flow <<= nm.mjoin(k="key", K="nam", m=mapFile, f="num:keyNum")
    flow <<= nm.mjoin(k="nam1", K="nam", m=mapFile, f="num:num1,leaf:leaf1")
    flow <<= nm.mjoin(k="nam2", K="nam", m=mapFile, f="num:num2,leaf:leaf2")

    if norm:
        # Rescale the range-normalized value into [1, er].
        flow <<= nm.mnormalize(f="ev:ev2", c="range")
        flow <<= nm.mcal(c='${ev2}*(%s-1)+1' % (er), a="evv")
        outSpec = "key,nam1,nam2,keyNum,num1,num2,el,evv:ev,ed,ec,leaf1,leaf2"
    else:
        outSpec = "key,nam1,nam2,keyNum,num1,num2,el,ev,ed,ec,leaf1,leaf2"
    flow <<= nm.mcut(f=outSpec, o=oFile)
    flow.run()
def __mkMap(key, nf, ni, ef1, ef2, ei, oFile):
    """Create the name -> number mapping file with a leaf marker.

    The output ``oFile`` has one row per distinct name (``nam``) with a
    ``leaf`` column (1 for leaf nodes, null otherwise) and a ``num``
    column (0 for the null name, ``line()+1`` for all others).

    :param key: cluster key field name
    :param nf: node field name in ``ni``
    :param ni: node file, or None
    :param ef1: first node field name in ``ei``
    :param ef2: second node field name in ``ei``
    :param ei: edge file
    :param oFile: output mapping file
    """
    # Build the leaf nodes: names that never match the key column of the
    # edge file (mcommon with r=True).
    leafSources = [
        nm.mcommon(k=ef1, K=key, m=ei, r=True, i=ei).mcut(f="%s:nam" % ef1),
        nm.mcommon(k=ef2, K=key, m=ei, r=True, i=ei).mcut(f="%s:nam" % ef2),
    ]
    if ni:
        leafSources.append(
            nm.mcommon(k=nf, K=key, m=ei, r=True, i=ni).mcut(f="%s:nam" % nf))
    xleaf = nm.muniq(i=leafSources, k="nam")
    xleaf <<= nm.msetstr(v=1, a="leaf")

    # All names: from the node file when given, otherwise from the edge file.
    if ni:
        nameSources = [
            nm.mcut(f="%s:nam" % nf, i=ni),
            nm.mcut(f="%s:nam" % key, i=ni),
        ]
    else:
        nameSources = [
            nm.mcut(f="%s:nam" % ef1, i=ei),
            nm.mcut(f="%s:nam" % ef2, i=ei),
            nm.mcut(f="%s:nam" % key, i=ei),
        ]

    flow = nm.muniq(k="nam", i=nameSources)
    flow <<= nm.mjoin(k="nam", m=xleaf, f="leaf", n=True)
    # null should come first, so mnumber would also number it 0, but mcal
    # is used to be safe.
    flow <<= nm.mcal(c='if(isnull($s{nam}),0,line()+1)', a="num")
    flow <<= nm.mnullto(f="nam", v="##NULL##", o=oFile)
    flow.run()
def g2pair(self, ni, nf, ei, ef1, ef2, ew, numFile, mapFile, weightFile):
    """Convert graph files into a pair of numbered nodes.

    Writes ``mapFile`` (node -> ``num``), ``numFile`` (header-less,
    space-separated ``num1 num2`` edge list) and ``weightFile`` (edge
    weights; a constant 1 in a ``weight`` column when ``ew`` is empty).

    :param ni: node file (used only when ``nf`` is given)
    :param nf: node field name in ``ni``
    :param ei: edge file
    :param ef1: first node field name in ``ei``
    :param ef2: second node field name in ``ei``
    :param ew: weight field name, or empty/None
    :return: the value of ``mrecount(i=mapFile)``
    """
    # Edge endpoints are flagged 0, node-file entries are flagged 1.
    edgeNodes1 = nm.mcut(f="%s:node" % ef1, i=ei).msetstr(v=0, a="flag")
    edgeNodes2 = nm.mcut(f="%s:node" % ef2, i=ei).msetstr(v=0, a="flag")

    if nf:
        fileNodes = nm.mcut(f=nf + ":node", i=ni).msetstr(v=1, a="flag")
        mapFlow = nm.mcut(i=[edgeNodes1, edgeNodes2, fileNodes], f="node,flag")
        mapFlow <<= nm.mbest(k="node", s="flag", fr=0, size=1)
    else:
        mapFlow = nm.mcut(i=[edgeNodes1, edgeNodes2], f="node,flag")
        mapFlow <<= nm.muniq(k="node")
    mapFlow <<= nm.mnumber(s="flag,node", a="num", S=0, o=mapFile)
    mapFlow.run()

    # Translate both endpoints of every edge into their numbers.
    pairFlow = nm.mcut(f=[ef1, ef2], i=ei)
    pairFlow <<= nm.mjoin(k=ef1, K="node", m=mapFile, f="num:num1")
    pairFlow <<= nm.mjoin(k=ef2, K="node", m=mapFile, f="num:num2")
    pairFlow <<= nm.mcut(f="num1,num2")
    pairFlow <<= nm.mfsort(f="num1,num2")
    pairFlow <<= nm.msortf(f="num1%n,num2%n", nfno=True)
    pairFlow <<= nm.cmd("tr ',' ' ' ")
    pairFlow <<= nm.mwrite(o=numFile)
    pairFlow.run()

    if ew:
        nm.mcut(f=ew, i=ei, o=weightFile).run()
    else:
        weightFld = "weight"
        nm.msetstr(v=1, a=weightFld, i=ei).mcut(f=weightFld, o=weightFile).run()

    return mrecount(i=mapFile)
def mnest2tree(ei, ef, k, ni=None, nf=None, ev=None, no=None, eo=None):
    """Rewrite a nested (keyed) graph as a tree-shaped edge/node file pair.

    For every edge row two rows are generated, linking the key ``k`` to
    each of the two endpoints.  These are joined back to the original
    edge rows, reduced per ``(#ef1, #ef2)`` pair (averaging ``ev`` when
    given, otherwise de-duplicating) and written to ``eo``.  When ``ni``
    is given, a node file is written to ``no`` as well.

    :param ei: edge file
    :param ef: two comma-separated edge field names
    :param k: key field name
    :param ni: node file, or None
    :param nf: node field name in ``ni``
    :param ev: edge value field to average, or None
    :param no: output node file
    :param eo: output edge file
    """
    # TODO (from the original author): add parameter checking.
    edgeFields = ef.split(",")
    ef1, ef2 = edgeFields[0], edgeFields[1]

    temp = mtemp.Mtemp()
    pairFile = temp.file()

    rows = nm.mcut(f="%s:#orgKey,%s:#orgEf1,%s:#orgEf2" % (k, ef1, ef2), i=ei)
    with _nu.mcsvout(o=pairFile, f="#orgKey,#orgEf1,#orgEf2,#ef1,#ef2") as oCSV:
        for row in rows:
            orgKey, orgEf1, orgEf2 = row[0], row[1], row[2]
            # One key -> endpoint edge for each endpoint.
            for endpoint in (orgEf1, orgEf2):
                oCSV.write([orgKey, orgEf1, orgEf2, orgKey, endpoint])

    # Join all fields of the original edge rows.
    edgeFlow = nm.mjoin(k="#orgKey,#orgEf1,#orgEf2", K=[k, ef1, ef2], m=ei, i=pairFile)
    if ev:
        edgeFlow <<= nm.mavg(k="#ef1,#ef2", f=ev)
    else:
        edgeFlow <<= nm.muniq(k="#ef1,#ef2")
    edgeFlow <<= nm.mcut(r=True, f="#orgKey,#orgEf1,#orgEf2")
    edgeFlow <<= nm.mfldname(f="#ef1:%s,#ef2:%s" % (ef1, ef2), o=eo)
    edgeFlow.run()

    if not ni:
        return

    # Fields of the node file other than the node field itself.
    otherFields = [name for name in nu.mheader(i=ni) if name != nf]
    emptyValues = ',' * (len(otherFields) - 1)

    # Names in the first edge column of ``eo`` that are missing from ``ni``,
    # padded with empty values for the other node fields.
    extraNodes = nm.mcut(f="%s:%s" % (ef1, nf), i=eo)
    extraNodes <<= nm.muniq(k=nf)
    extraNodes <<= nm.mcommon(k=nf, m=ni, r=True)
    extraNodes <<= nm.msetstr(v=emptyValues, a=otherFields)

    nodeFlow = nm.mcut(f=k, r=True, i=[ni, extraNodes])
    nodeFlow <<= nm.msetstr(v="", a=k, o=no)
    nodeFlow.run()
def tra2tbl(iFile, oFile, tidFld, itemFld, null=0, dummy=True, aggFld=None,
            aggStat=None, klassFld=None):
    """Convert transaction data (one row per tid/item) into a wide table.

    The output has one row per ``tidFld`` value and one column per
    ``itemFld`` value.  The cell value is:

    * the ``aggStat`` statistic of ``aggFld`` when ``aggFld`` is given,
    * 1 when ``dummy`` is true (item occurred),
    * the occurrence count otherwise.

    :param iFile: input transaction file
    :param oFile: output table file
    :param tidFld: transaction id field name
    :param itemFld: item field name
    :param null: value substituted for every null cell
    :param dummy: emit 1 for presence instead of a count
    :param aggFld: field to aggregate into the cell, or None/""
    :param aggStat: statistic passed to ``mstats c=``
    :param klassFld: class field joined back per transaction, or None/""
    """
    cellKey = tidFld + "," + itemFld
    hasKlass = klassFld is not None and klassFld != ""

    klass = None
    if hasKlass:
        klass <<= nm.mcut(f=tidFld + "," + klassFld, i=iFile)
        klass <<= nm.muniq(k=tidFld)

    f = None
    if aggFld is not None and aggFld != "":
        # Aggregate the cell value per (tid, item).  The key must include
        # the item field: the result is cross-tabulated on it below, and
        # the counting branch uses the same key.
        f <<= nm.mcut(f=cellKey + "," + aggFld, i=iFile)
        f <<= nm.mstats(k=cellKey, f="%s:_cell" % (aggFld), c=aggStat)
    else:
        f <<= nm.mcut(f=cellKey, i=iFile)
        if dummy:
            # Whether the item occurred at all.
            f <<= nm.muniq(k=cellKey)
            f <<= nm.msetstr(v=1, a="_cell")
        else:
            # How many times the item occurred.
            f <<= nm.mcount(k=cellKey, a="_cell")

    # Pivot items into columns.
    f <<= nm.m2cross(k=tidFld, s=itemFld, f="_cell")

    # Join the class field when one was specified.
    if hasKlass:
        f <<= nm.mjoin(k=tidFld, m=klass, f=klassFld)

    # Replace all null cells at once.
    f <<= nm.mnullto(f="*", v=null, o=oFile)
    f.run(msg="on")
def run(self):
    """Build a similarity graph with mtra2gc, then select edges with mfriends.

    Reads ``self.iFile`` and writes the edge file ``self.oeFile`` and the
    node file ``self.onFile``; when ``self.orFile`` is set, the rows of
    the similarity graph that correspond to selected edges are written
    there as well.
    """
    temp = mtemp.Mtemp()

    ### mtra2gc
    xxsimgN = temp.file()
    xxsimgE = temp.file()
    xxsimgE0 = temp.file()

    param = {"i": self.iFile}
    if self.idFN:
        param["tid"] = self.idFN
    if self.itemFN:
        param["item"] = self.itemFN
    if self.sp1:
        param["s"] = self.sp1
    if self.sp2:
        param["S"] = self.sp2

    # To enumerate confidence in both directions, enumerate bidirectionally
    # with sim=C th=0.  The output doubles in size, but this is recovered
    # by running mfriends with directed.
    param["sim"] = "C"
    param["th"] = "0"
    param["node_support"] = True
    if self.numtp:
        param["num"] = True
    param["no"] = xxsimgN
    param["eo"] = xxsimgE0
    nt.mtra2gc(**param).run()

    # Apply every configured numeric range filter to the edges.
    flt = nm.readcsv(xxsimgE0)
    for idx in range(self.filterSize):
        rangeSpec = "[%s,%s]" % (self.lb[idx], self.ub[idx])
        flt <<= nm.mselnum(f=self.filter[idx], c=rangeSpec)
    flt <<= nm.writecsv(xxsimgE)
    flt.run()

    ### mfriends
    xxfriends = temp.file()
    xxfriendE = temp.file()
    xxw = temp.file()
    xxf = temp.file()
    xxff = temp.file()
    xxor = temp.file()
    if not os.path.isdir(xxfriends):
        os.makedirs(xxfriends)

    # [color for "W" edges, color for "F" edges] per similarity index.
    col = [["FF000080", "FF888880"],
           ["0000FF80", "8888FF80"],
           ["00FF0080", "88FF8880"]]

    for idx, simName in enumerate(self.sim):
        edgeOut = xxfriends + "/e_" + str(idx)
        paramf = {
            "ei": xxsimgE,
            "ni": xxsimgN,
            "ef": "node1,node2",
            "nf": "node",
            "eo": xxfriendE,
            "no": xxfriends + "/n_" + str(idx),
            "sim": simName,
            "dir": self.dir[idx],
            "rank": self.rank[idx],
            "directed": True,
        }
        nt.mfriends(**paramf).run()

        # Node pairs that appear twice after field-sorting.
        #   node1%0,node2%1,fld,count,mean
        #   a,b,support,2,0.1818181818
        #   a,d,support,2,0.1818181818
        frec2 = nm.mfsort(f="node1,node2", i=xxfriendE)
        frec2 <<= nm.msummary(k="node1,node2", f=simName, c="count,mean")
        frec2 <<= nm.mselstr(f="count", v=2)

        f = nm.mjoin(k="node1,node2", K="node1,node2", m=frec2, f="mean:s1",
                     n=True, i=xxfriendE)
        f <<= nm.mjoin(k="node2,node1", K="node1,node2", m=frec2, f="mean:s2",
                       n=True)
        # 1) If sim cannot be joined (s1 and s2 both null) the edge is
        #    one-directional, so mark it "F".
        # 2) Of the bidirectional edges a->b, b->a, mark only a->b
        #    (s1 not null) with "W".
        # 3) Every other edge is marked "D" and removed.
        f <<= nm.mcal(
            c='if(isnull($s{s1}),if(isnull($s{s2}),"F","D"),"W")', a="dir")
        f <<= nm.mselstr(f="dir", v="D", r=True)
        f <<= nm.mcal(c='if($s{dir}=="W",$s{s1},$s{%s})' % (simName), a="sim")
        f <<= nm.mchgstr(f="dir:color",
                         c='W:%s,F:%s' % (col[idx][0], col[idx][1]), A=True)
        f <<= nm.msetstr(v=[simName, str(idx)], a="simType,simPriority")
        f <<= nm.mcut(f="simType,simPriority,node1,node2,sim,dir,color",
                      o=edgeOut)
        f.run()
        # Example output:
        #   node1%1,node2%0,simType,sim,dir,color
        #   b,a,jaccard,0.3333333333,F,8888FF
        #   a,e,jaccard,0.5,W,0000FF

    # Output of the rule file.
    if self.orFile:
        selected = nm.mcat(i=xxfriends + "/e_*").muniq(k="node1,node2")
        nm.mcommon(k="node1,node2", i=xxsimgE, m=selected, o=self.orFile).run()

    # Unify multi-edges (W first, then parameter position).
    if self.prune:
        # NOTE: the original author flagged this branch as not working and
        # the use of direction("u") as something to reconsider.  An older
        # temp-file based variant (split into W / non-W with mselstr u=,
        # then mcommon r=True twice and mbest) was kept here as a disabled
        # string literal.
        fo = nm.mcat(i=xxfriends + "/e_*").mselstr(f="dir", v="W")
        fu = fo.direction("u")
        fu <<= nm.mcommon(k="node1,node2", K="node1,node2", r=True, m=fo)
        fu <<= nm.mcommon(k="node1,node2", K="node2,node1", r=True, m=fo)
        best = nm.mbest(i=[fo, fu], k="node1,node2",
                        s="dir%r,simPriority%n", o=self.oeFile)
        best.run()
    else:
        nm.mcat(i=xxfriends + "/e_*", o=self.oeFile).run()

    nm.mcat(i=xxfriends + "/n_0", o=self.onFile).run()
def enumerate(self, eArgs):
    """Enumerate frequent sequence patterns with lcm_seq.

    Results are stored in ``self.pFile`` (patterns with support) and,
    when ``self.outtf`` is true, ``self.tFile`` (transaction id to
    pattern id).

    Recognized ``eArgs`` keys: ``minCnt`` / ``minSup`` (one is required),
    ``maxCnt`` / ``maxSup``, ``top``, ``minLen``, ``maxLen``, ``gap``,
    ``win`` and ``padding``.  A key whose value is ``None`` is treated as
    not specified.

    :param eArgs: dict of enumeration parameters
    """
    tf = mtemp.Mtemp()
    dbSize = float(self.db.size)

    # Minimum support and minimum support count.
    minCnt = eArgs.get("minCnt")
    if minCnt is not None:
        self.minCnt = int(minCnt)
        self.minSup = float(self.minCnt) / dbSize
    else:
        self.minSup = float(eArgs["minSup"])
        self.minCnt = int(self.minSup * dbSize + 0.99)

    # Maximum support and maximum support count.
    self.maxCnt = None
    maxCnt = eArgs.get("maxCnt")
    maxSup = eArgs.get("maxSup")
    if maxCnt is not None:
        self.maxCnt = int(maxCnt)
        self.maxSup = float(self.maxCnt) / dbSize
    elif maxSup is not None:
        self.maxSup = float(maxSup)
        self.maxCnt = int(self.maxSup * dbSize + 0.99)

    # When an upper limit on the number of patterns is given, run lcm_seq
    # once beforehand to obtain the minimum support count.
    if eArgs.get("top") is not None:
        self.top = eArgs["top"]
    if self.top and self.top > 0:
        xxtop = tf.file()
        extTake.lcmseq(type="Cf", K=str(self.top), i=self.file, sup="1",
                       so=xxtop)
        with open(xxtop, "r") as rfile:
            self.minCnt = int(rfile.read().strip())

    # lcm_seq output file.  It is not created when there is no frequent
    # pattern, so create an empty one up front.
    lcmout = tf.file()
    with open(lcmout, "w"):
        pass

    # lcm_seq parameters.
    params = {"type": "CIf_" if self.msgoff else "CIf"}
    if self.maxCnt:
        params["U"] = str(self.maxCnt)
    # None means "not specified": do not pass the literal string "None".
    for argName, optName in (("minLen", "l"), ("maxLen", "u"),
                             ("gap", "g"), ("win", "G")):
        value = eArgs.get(argName)
        if value is not None:
            params[optName] = str(value)
    params["i"] = self.file
    params["sup"] = str(self.minCnt)
    params["o"] = lcmout

    # With padding, run the lcm_seq variant that does not output item 0.
    if eArgs.get("padding"):
        extTake.lcmseq_zero(params)
    else:
        extTake.lcmseq(params)

    # Compute the support of each pattern and write it as CSV.
    self.pFile = self.temp.file()
    items = self.db.items
    transl = self.temp.file()
    extTake.lcmtrans(lcmout, "p", transl)

    f = nm.mdelnull(f="pattern", i=transl)
    f <<= nm.mvreplace(vf="pattern", m=items.file, K=items.idFN,
                       f=items.itemFN)
    f <<= nm.msetstr(v=self.db.size, a="total")
    f <<= nm.mcal(c='${count}/${total}', a="support")
    f <<= nm.mcut(f="pid,pattern,size,count,total,support")
    f <<= nm.msortf(f="support%nr", o=self.pFile)
    f.run()

    if self.outtf:
        # Write out the patterns occurring in each transaction.
        self.tFile = self.temp.file()

        # Sequential __tid -> original transaction id table.
        xxw = tf.file()
        f = None
        f <<= nm.mcut(f=self.db.idFN, i=self.db.file)
        f <<= nm.muniq(k=self.db.idFN)
        f <<= nm.mnumber(S=0, a="__tid", q=True)
        f <<= nm.msortf(f="__tid", o=xxw)
        f.run()

        translt = self.temp.file()
        extTake.lcmtrans(lcmout, "t", translt)

        f = None
        f <<= nm.msortf(f="__tid", i=translt)
        f <<= nm.mjoin(k="__tid", m=xxw, f=self.db.idFN)
        f <<= nm.mcut(f=self.db.idFN + ",pid")
        f <<= nm.msortf(f=self.db.idFN + ",pid", o=self.tFile)
        f.run()
def run(self):
    """Enumerate cliques with mace and write them with original node names.

    The graph is first converted to numbered pairs, mace is run on it and
    the resulting cliques are written to ``self.oFile`` as
    ``id,node,size`` rows.  When ``self.ni`` is set, nodes that are in no
    clique are appended as cliques of size 1.  When ``self.logFile`` is
    set, the arguments and processing time are logged there.
    """
    from datetime import datetime

    startTime = datetime.now()

    wf = nu.Mtemp()
    xxinp = wf.file()
    xxmap = wf.file()
    self.g2pair(self.ni, self.nf, self.ei, self.ef1, self.ef2, xxinp, xxmap)

    xxmace = wf.file()  # mace output (transaction format)

    # "C" = all cliques, "M" = maximal ones; a trailing "_" is added when
    # messages are turned off.
    maceType = "Ce" if self.all else "Me"
    if self.msgoff:
        maceType += "_"
    para = {"type": maceType, "i": xxinp, "o": xxmace}
    if self.minSize:
        para["l"] = self.minSize
    if self.maxSize:
        para["u"] = self.maxSize
    extTake.mace(para)

    # Number of rows mace produced; used as the first id for isolated nodes.
    cliqueCount = nu.mrecount(i=xxmace, nfni=True)

    # Convert the transaction-format output into (id, num, size) pairs.
    fpair = None
    fpair <<= nm.mcut(i=xxmace, nfni=True, f="0:num")
    fpair <<= nm.mnumber(q=True, a="id")
    fpair <<= nm.mvcount(vf="num:size")
    fpair <<= nm.mtra(r=True, f="num")

    # when ni= specified, it add the isolated single cliques.
    if self.ni:
        usedNodes = nm.mread(i=fpair)
        if self.all:
            usedNodes <<= nm.mselstr(f="size", v=1)
        usedNodes <<= nm.mcut(f="num")
        usedNodes <<= nm.muniq(k="num")

        # select all nodes which are not included in any cliques
        isolated = None
        isolated <<= nm.mcut(f="num", i=xxmap)
        isolated <<= nm.mcommon(k="num", m=usedNodes, r=True)
        isolated <<= nm.mnumber(S=cliqueCount, a="id", q=True)
        isolated <<= nm.msetstr(v=1, a="size")
        isolated <<= nm.mcut(f="id,num,size")

        # Without this mcut the result goes wrong (original author: to be
        # fixed later).
        xxpair = nm.mcut(i=[fpair, isolated], f="id,num,size")
    else:
        xxpair = fpair

    # Convert the numbered nodes back into their original names.
    xxpair <<= nm.mjoin(m=xxmap, k="num", f="node")
    xxpair <<= nm.mcut(f="id,node,size")
    xxpair <<= nm.msortf(f="id,node", o=self.oFile)
    xxpair.run()

    procTime = datetime.now() - startTime

    # Write the log file.
    if self.logFile:
        logRows = [["key", "value"]]
        logRows.extend([argName, str(argVal)]
                       for argName, argVal in self.args.items())
        logRows.append(["time", str(procTime)])
        nm.writecsv(i=logRows, o=self.logFile).run()
def __mkNode(key, nf, nl, nv, nc, ni, ef1, ef2, ei, noiso, norm, mapFile,
             oFile):
    """Build the normalized node table used for DOT generation.

    Node rows come from the node file ``ni`` when it is given, otherwise
    they are derived from the edge file ``ei``.  With ``noiso`` and a
    node file, only nodes that also appear on an edge are written.

    :param key: cluster key field name
    :param nf: node field name in ``ni``
    :param nl: list of label field names (joined with "_"), or None
    :param nv: node value (size) field name, or None
    :param nc: node color field name, or None
    :param ni: node file, or None
    :param ef1: first node field name in ``ei``
    :param ef2: second node field name in ``ei``
    :param ei: edge file
    :param noiso: drop isolated nodes
    :param norm: when true, ``nv`` is range-normalized and rescaled
    :param mapFile: name -> num/leaf mapping file
    :param oFile: output file
    """
    xbyE = None
    xbyN = None

    # Generate node information from the edge file.  With noiso (isolated
    # nodes excluded) this is also needed to find which nodes appear on an
    # edge.
    if ni is None or noiso:
        inp = [
            nm.mcut(f="%s:key,%s:nam,%s:nl" % (key, ef1, ef1), i=ei),
            nm.mcut(f="%s:key,%s:nam,%s:nl" % (key, ef2, ef2), i=ei),
        ]
        xbyE <<= nm.mnullto(i=inp, f="key", v="##NULL##")
        xbyE <<= nm.muniq(k="key,nam")
        xbyE <<= nm.mjoin(k="key", K="nam", m=mapFile, f="num:keyNum")
        xbyE <<= nm.mjoin(k="nam", K="nam", m=mapFile, f="num,leaf")
        xbyE <<= nm.msetstr(v=",,,,", a="nv,nc,nlKey,nvKey,ncKey")
        xbyE <<= nm.mcut(
            f="key,nam,keyNum,num,nl,nv,nc,leaf,nvKey,ncKey,nlKey")

    # Generate node information from the node file.
    if ni:
        # Label fields for mcal cat().  Every field must be referenced as
        # $s{field}; a bare field name is not a valid mcal operand.
        if nl:
            label = ["$s{" + nml + "}" for nml in nl]
        else:
            label = ["$s{%s}" % (nf)]

        nvcStr = []
        if nv:
            nvcStr.append('%s:nv' % (nv))
        if nc:
            nvcStr.append('%s:nc' % (nc))

        # Example of the mapping file:
        #   nam,leaf,num
        #   ##NULL##,,0
        #   #1_1,,2
        #   #2_1,,5
        #   a,1,6
        #   b,1,7
        f = None
        f <<= nm.mcal(c='cat("_",%s)' % (','.join(label)), a="##label", i=ni)
        if len(nvcStr) == 0:
            f <<= nm.mcut(f='%s:key,%s:nam,##label:nl' % (key, nf))
        else:
            f <<= nm.mcut(f='%s:key,%s:nam,##label:nl,%s' %
                          (key, nf, ','.join(nvcStr)))
        f <<= nm.mnullto(f="key", v="##NULL##")
        if not nv:
            f <<= nm.msetstr(v="", a="nv")
        if not nc:
            f <<= nm.msetstr(v="", a="nc")
        f <<= nm.mjoin(k="key", K="nam", m=mapFile, f="num:keyNum")
        f <<= nm.mjoin(k="nam", K="nam", m=mapFile, f="num,leaf")

        if norm:
            # The node magnification ratio is fixed.
            nr = 3
            f <<= nm.mnormalize(f="nv:nv2", c="range")
            f <<= nm.mcal(c='${nv2}*(%s-1)+1' % (nr), a="nvv")
            f <<= nm.mcut(f="key,nam,keyNum,num,nl,nvv:nv,nc,leaf")
        else:
            f <<= nm.mcut(f="key,nam,keyNum,num,nl,nv,nc,leaf")

        # Attach the label / value / color of each node's key (cluster) row;
        # fall back to the key itself when the key has no label.
        xbyN <<= nm.mjoin(k="keyNum", K="num", m=f,
                          f="nl:nlk,nv:nvKey,nc:ncKey", n=True, i=f)
        xbyN <<= nm.mcal(c='if(isnull($s{nlk}),$s{key},$s{nlk})', a='nlKey')
        xbyN <<= nm.mcut(f="nlk", r=True)

    if ni is not None and noiso:
        nm.mcommon(k="key,nam", m=xbyE, i=xbyN, o=oFile).run()
    elif ni is not None:
        xbyN.writecsv(o=oFile).run()
    else:
        xbyE.writecsv(o=oFile).run()
def enumerate(self, eArgs):
    """Enumerate contrast (emerging) itemset patterns per class with lcm.

    Each class in ``self.db.clsNameRecSize`` is treated in turn as the
    positive class and all others as negative.  The merged result is
    stored in ``self.pFile``; when ``self.outtf`` is true the
    transaction-to-pattern table is stored in ``self.tFile``.

    Recognized ``eArgs`` keys: ``type`` (required), ``minGR`` or
    ``minProb`` (default 0.5) with optional ``uniform``, ``minCnt`` or
    ``minSup`` (scalar or per-class dict), ``maxCnt`` or ``maxSup``
    (scalar or per-class dict), ``minLen``, ``maxLen``.

    :param eArgs: dict of enumeration parameters
    """
    pFiles = []
    tFiles = []
    tf = mtemp.Mtemp()
    uniform = "uniform" in eArgs and eArgs["uniform"] == True  # noqa: E712

    for cName, posSize in self.db.clsNameRecSize.items():
        negSize = self.db.traSize - posSize

        # Minimum growth rate.
        if "minGR" in eArgs:
            self.minGR = eArgs["minGR"]
        else:
            minProb = eArgs["minProb"] if ("minProb" in eArgs) else 0.5
            odds = minProb / (1 - minProb)
            # Formula (4) of the manual.
            if uniform:
                self.minGR = odds * (self.db.clsSize - 1)
            else:
                self.minGR = odds * (float(negSize) / float(posSize))

        # Minimum support / minimum support count.  Accepted forms:
        #   minSup=0.05   minSup={"c1": 0.05, "c2": 0.06}
        #   minCnt=10     minCnt={"c1": 10, "c2": 15}
        if "minCnt" in eArgs:
            if isinstance(eArgs["minCnt"], dict):
                self.minPos = eArgs["minCnt"][cName]
            else:
                self.minPos = eArgs["minCnt"]
        else:
            if isinstance(eArgs["minSup"], dict):
                self.minPos = int(eArgs["minSup"][cName] * float(posSize) + 0.99)
            else:
                # Was `flost(posSize)`, which raised NameError.
                self.minPos = int(eArgs["minSup"] * float(posSize) + 0.99)

        # Maximum support / maximum support count.
        if "maxCnt" in eArgs:
            if isinstance(eArgs["maxCnt"], dict):
                self.maxPos = eArgs["maxCnt"][cName]
            else:
                self.maxPos = eArgs["maxCnt"]
        elif "maxSup" in eArgs:
            if isinstance(eArgs["maxSup"], dict):
                self.maxPos = int(eArgs["maxSup"][cName] * float(posSize) + 0.99)
            else:
                self.maxPos = int(eArgs["maxSup"] * float(posSize) + 0.99)
        else:
            self.maxPos = None

        self.sigma[cName] = self.calSigma(self.minPos, self.minGR, posSize,
                                          negSize)

        # lcm does not create its output file when there is no frequent
        # pattern, so create an empty one up front.
        lcmout = tf.file()
        with open(lcmout, "w"):
            pass

        # lcm parameters.
        runPara = {}
        if self.msgoff:
            runPara["type"] = eArgs["type"] + "IA_"
        else:
            runPara["type"] = eArgs["type"] + "IA"
        if self.maxPos:
            runPara["U"] = self.maxPos
        if "minLen" in eArgs:
            runPara["l"] = str(eArgs["minLen"])
        if "maxLen" in eArgs:
            runPara["u"] = str(eArgs["maxLen"])
        runPara["w"] = self.weightFile[cName]
        runPara["i"] = self.file
        runPara["sup"] = str(self.sigma[cName])
        runPara["o"] = lcmout
        extTake.lcm(runPara)

        # Write this class's patterns as CSV.
        pFiles.append(self.temp.file())
        transle = tf.file()
        extTake.lcmtrans(lcmout, "e", transle)

        f = nm.mdelnull(f="pattern", i=transle)
        f <<= nm.mcal(c='round(${countN},1)', a="neg")
        f <<= nm.mcal(c='round(${countP}/%s,1)' % (self.posWeight[cName]),
                      a="pos")
        f <<= nm.mdelnull(f="pattern")  # original author: is this needed?
        f <<= nm.msetstr(v=cName, a="class")
        f <<= nm.msetstr(v=posSize, a="posTotal")
        f <<= nm.msetstr(v=self.minGR, a="minGR")
        f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,minGR",
                      o=pFiles[-1])
        f.run()

        if self.outtf:
            # Write out the patterns occurring in each transaction.
            tFiles.append(self.temp.file())

            # Sequential __tid -> original transaction id table.
            xxw = nm.mcut(f=self.db.idFN, i=self.db.file)
            xxw <<= nm.muniq(k=self.db.idFN)
            xxw <<= nm.mnumber(S=0, a="__tid", q=True)

            translt = self.temp.file()
            extTake.lcmtrans(lcmout, "t", translt)

            f = nm.mjoin(k="__tid", m=xxw, f=self.db.idFN, i=translt)
            f <<= nm.msetstr(v=cName, a="class")
            f <<= nm.mcut(f=self.db.idFN + ",class,pid", o=tFiles[-1])
            f.run()

    # Merge the per-class pattern and tid-pid files into the final output.
    self.pFile = self.temp.file()
    self.tFile = self.temp.file()

    # Concatenate pattern files and assign a pid unique across classes.
    xxpCat = tf.file()
    f = nm.mcat(i=",".join(pFiles))
    f <<= nm.msortf(f="class,pid")
    f <<= nm.mnumber(s="class,pid", S=0, a="ppid", o=xxpCat)
    f.run()

    # Compute the pattern statistics.
    items = self.db.items
    f = nm.mcut(f="class,ppid:pid,pattern,size,pos,neg,posTotal,minGR",
                i=xxpCat)
    f <<= nm.msetstr(v=self.db.traSize, a="total")
    f <<= nm.mcal(c='${total}-${posTotal}', a="negTotal")  # negative total
    f <<= nm.mcal(c='${pos}/${posTotal}', a="support")  # support
    f <<= nm.mcal(
        c='if(${neg}==0,1.797693135e+308,(${pos}/${posTotal})/(${neg}/${negTotal}))',
        a="growthRate")
    if uniform:
        f <<= nm.mcal(
            c='(${pos}/${posTotal})/(${pos}/${posTotal}+(%s-1)*${neg}/${negTotal})'
            % (self.db.clsSize),
            a="postProb")
    else:
        f <<= nm.mcal(c='${pos}/(${pos}+${neg})', a="postProb")
    # Selection by minimum support and minimum growth rate.
    # NOTE(review): self.minPos holds the value of the last class
    # processed above - confirm this is intended when per-class
    # minCnt/minSup dicts are used.
    f <<= nm.msel(c='${pos}>=%s&&${growthRate}>=${minGR}' % (self.minPos))
    f <<= nm.mvreplace(vf="pattern", m=items.file, K=items.idFN,
                       f=items.itemFN)
    f <<= nm.mcut(
        f="class,pid,pattern,size,pos,neg,posTotal,negTotal,total,support,growthRate,postProb"
    )
    f <<= nm.mvsort(vf="pattern")
    f <<= nm.msortf(f="class%nr,postProb%nr,pos%nr", o=self.pFile)
    f.run()

    # Remove patterns that are redundant with respect to the taxonomy.
    if items.taxonomy:
        zdd = VSOP.constant(0)
        for fldVal in nm.mcut(i=self.pFile, f="pattern"):
            zdd = zdd + VSOP.itemset(fldVal[0])
        zdd = self.reduceTaxo(zdd, self.db.items)

        xxp1 = tf.file()
        xxp2 = tf.file()
        xxp3 = tf.file()
        zdd.csvout(xxp1)
        nm.mcut(nfni=True, f="1:pattern", i=xxp1).mvsort(
            vf="pattern").msortf(f="pattern", o=xxp2).run()
        nm.msortf(f="pattern", i=self.pFile).mcommon(
            k="pattern", m=xxp2).msortf(f="class%nr,postProb%nr,pos%nr",
                                        o=xxp3).run()
        shutil.move(xxp3, self.pFile)

    if self.outtf:
        # Master used to keep only transactions containing an enumerated
        # pattern.
        xxp4 = nm.mcut(f="class,pid", i=self.pFile)
        f = nm.mcat(i=",".join(tFiles))
        # Join the pid unified across all classes (ppid).
        f <<= nm.mjoin(k="class,pid", m=xxpCat, f="ppid")
        # Keep only the enumerated patterns.
        f <<= nm.mcommon(k="class,ppid", K="class,pid", m=xxp4)
        f <<= nm.mcut(f=self.db.idFN + ",class,ppid:pid")
        f <<= nm.msortf(f=self.db.idFN + ",class,pid", o=self.tFile)
        f.run()
def _mgv_optional_str(name, value):
    """Raise TypeError unless *value* is a str or None."""
    if not (isinstance(value, str) or value is None):
        raise TypeError("%s= unsupport %s" % (name, str(type(value))))


def _mgv_field_list(name, value):
    """Return a str | list | None field spec as a list, or None when empty."""
    if isinstance(value, str):
        value = value.split(',')
        if len(value) == 1 and value[0] == '':
            value = None
    elif not (isinstance(value, list) or value is None):
        raise TypeError("%s= unsupport %s" % (name, str(type(value))))
    return value


def _mgv_flag(name, value):
    """Return a bool | None flag as a bool (None -> False), else raise TypeError."""
    if value is None:
        value = False
    if not isinstance(value, bool):
        raise TypeError("%s= unsupport %s" % (name, str(type(value))))
    return value


def mgv(ei, ef, ev=None, ec=None, el=None, ed=None, ni=None, nf=None,
        nv=None, nc=None, nl=None, nw=1, tp="flat", k=None, o=None, d=False,
        clusterLabel=False, noiso=False, normalize=False,
        normalizeEdge=False, normalizeNode=False):
    """Generate a Graphviz DOT file from an edge file and optional node file.

    :param ei: edge file name (str)
    :param ef: two edge field names (comma-separated str or list)
    :param ev: edge value field name (str | None)
    :param ec: edge color field name (str | None)
    :param el: edge label field names (str | list | None)
    :param ed: edge direction field name (str | None)
    :param ni: node file name (str | None)
    :param nf: node field name (str | None)
    :param nv: node value field name (str | None)
    :param nc: node color field name (str | None)
    :param nl: node label field names (str | list | None)
    :param nw: passed through to the DOT node writer
    :param tp: layout type, "flat" or "nest" (None means "flat")
    :param k: cluster key field name (str | None)
    :param o: output file name (str, required)
    :param d: directed graph (bool | None)
    :param clusterLabel: bool | None
    :param noiso: exclude isolated nodes (bool | None)
    :param normalize: shorthand enabling both normalizeEdge and normalizeNode
    :param normalizeEdge: normalize edge values (bool | None)
    :param normalizeNode: normalize node values (bool | None)
    :raises TypeError: on an argument of unsupported type, an ``ef`` with
        fewer than two fields, or an unsupported ``tp`` value
    """
    # ---- argument checks (same order as the parameters are documented) ----
    if not isinstance(ei, str):
        raise TypeError("ei= unsupport " + str(type(ei)))

    if isinstance(ef, str):
        ef = ef.split(',')
    elif not isinstance(ef, list):
        raise TypeError("ef= unsupport " + str(type(ef)))
    if len(ef) < 2:
        raise TypeError("ef size == 2 ")
    elif len(ef) > 2:
        sys.stderr.write('warning : ef size == 2 ')

    _mgv_optional_str("k", k)
    _mgv_optional_str("ev", ev)
    _mgv_optional_str("ec", ec)
    el = _mgv_field_list("el", el)
    _mgv_optional_str("ed", ed)
    _mgv_optional_str("ni", ni)
    _mgv_optional_str("nf", nf)
    _mgv_optional_str("nv", nv)
    _mgv_optional_str("nc", nc)
    nl = _mgv_field_list("nl", nl)

    if tp is None:
        tp = "flat"
    elif not isinstance(tp, str):
        raise TypeError("tp= unsupport " + str(type(tp)))

    if not isinstance(o, str):
        raise TypeError("o= unsupport " + str(type(o)))

    d = _mgv_flag("d", d)
    clusterLabel = _mgv_flag("clusterLabel", clusterLabel)
    noiso = _mgv_flag("noiso", noiso)
    # These three used to report "noiso=" in their error message.
    normalize = _mgv_flag("normalize", normalize)
    normalizeEdge = _mgv_flag("normalizeEdge", normalizeEdge)
    normalizeNode = _mgv_flag("normalizeNode", normalizeNode)
    if normalize:
        normalizeEdge = True
        normalizeNode = True

    # ---- working files ----
    temp = mtemp.Mtemp()
    xxni = temp.file()
    xxei = temp.file()
    xxmap = temp.file()
    xxnode = temp.file()
    xxedge = temp.file()
    xxdotNode = temp.file()
    xxdotEdge = temp.file()
    xxtree = temp.file()
    mkDir(xxdotNode)
    mkDir(xxdotEdge)

    directedStr = "edge []" if d else "edge [dir=none]"

    # Add an empty key column (used for clustering) when none was given.
    if not k:
        if ni:
            nm.msetstr(v="", a="#key", i=ni, o=xxni).run()
            ni = xxni
        nm.msetstr(v="", a="#key", i=ei, o=xxei).run()
        ei = xxei
        k = "#key"

    ef1 = ef[0]
    ef2 = ef[1]

    __mkMap(k, nf, ni, ef1, ef2, ei, xxmap)
    __mkNode(k, nf, nl, nv, nc, ni, ef1, ef2, ei, noiso, normalizeNode,
             xxmap, xxnode)
    __mkEdge(k, ef1, ef2, el, ec, ed, ev, ei, normalizeEdge, xxmap, xxedge)

    # Generate the DOT node and edge data as one file per cluster.
    __dotNode(xxnode, nw, tp, clusterLabel, xxdotNode)
    __dotEdge(xxedge, xxdotEdge)

    if tp == "flat":
        depth = __mkFlat(xxnode, xxtree)
    elif tp == "nest":
        # Tree handling: only the clusters are stored in the tree structure.
        depth = __mkTree(xxnode, xxtree)
    else:
        raise TypeError("unsupport type " + tp)

    xxdotTree = temp.file()
    header = '''digraph G {{
	{directedStr}
'''.format(directedStr=directedStr)
    footer = "}\n"
    __dotTree(xxtree, depth, header, footer, xxdotTree)
    __replace(xxdotTree, xxdotNode, xxdotEdge, clusterLabel, o)
def run(self, **kw_args):
    """Run sketchsort on ``self.iFile`` and write similar pairs to ``self.oFile``.

    The input rows are numbered, converted to a header-less
    space-separated file and passed to ``extMining.sketchsort``.  The
    resulting line-number pairs are mapped back to the transaction id
    fields (``self.tidH``).

    :param kw_args: ``msg="on"`` raises the verbosity level
    :raises Exception: when sketchsort returns a non-zero status
    """
    verbose = kw_args.get("msg") == "on" if "msg" in kw_args else False
    os.environ['KG_ScpVerboseLevel'] = "4" if verbose else "2"

    # Name of the line-number field.
    lineFld = "{}line".format(self.pt)

    xxmap = self.workf.file()
    sdata = self.workf.file()

    # Convert the data for sketchsort.
    numbered = nm.mnumber(S=0, a=lineFld, q=True, i=self.iFile)
    if self.wfH:
        body = nm.mcut(f=self.wfH + self.tidH + self.elem, i=numbered)
    else:
        # No window field given: add a constant one.
        self.wfH = ["{}wf".format(self.pt)]
        body = nm.msetstr(v=0, a=self.wfH, i=numbered)
        body <<= nm.mcut(f=self.wfH + self.tidH + self.elem)

    # Line number -> tid mapping, used to translate the result back.
    fmap = nm.mcut(f=[lineFld] + self.tidH, i=numbered, o=xxmap)

    body <<= nm.mcut(f=self.wfH + self.elem, nfno=True)
    body <<= nm.cmd("tr ',' ' '")
    body <<= nm.mwrite(o=sdata)
    nm.runs([fmap, body])

    # Run sketchsort.
    outf = self.workf.file()
    para = {}
    if self.dist == "C":
        para["cosdist"] = self.th
    elif self.dist == "H":
        para["hamdist"] = self.th
    if not self.uc:
        para["centering"] = True
    para["auto"] = True
    para["windowsize"] = self.ws
    para["seed"] = self.seed
    para["missingratio"] = self.mr
    para["i"] = sdata
    para["o"] = outf
    if extMining.sketchsort(para):
        raise Exception("#ERROR# checking sketchsort messages")

    # "<tid>:<tid>2" rename specs for the second element of each pair.
    tid2 = ",".join("{}:{}2".format(fld, fld) for fld in self.tidH)

    res = nm.mread(i=outf)
    res <<= nm.cmd("tr ' ' ',' ")
    res <<= nm.mcut(nfni=True, f="0:eline1,1:eline2,2:distance")
    res <<= nm.mfsort(f="eline*")
    # Fetch the tid that corresponds to each line number.
    res <<= nm.mjoin(k="eline1", K=lineFld, f=self.tidH, m=xxmap)
    res <<= nm.mjoin(k="eline2", K=lineFld, f=tid2, m=xxmap)
    res <<= nm.msortf(f="eline1%n,eline2%n")
    res <<= nm.mcut(r=True, f="eline1,eline2")
    res <<= nm.msortf(f=self.tidH)
    res <<= nm.mfldname(q=True, o=self.oFile)
    res.run()

    nu.mmsg.endLog(self.__cmdline())
def enumerate(self, eArgs):
    """
    Enumerate frequent itemsets under the conditions given in eArgs.

    :type eArgs: dict
    :type eArgs['type']: str
    :type eArgs['minCnt']: int
    :type eArgs['minSup']: float
    :type eArgs['maxCnt']: int
    :type eArgs['maxSup']: float
    :type eArgs['minLen']: int
    :type eArgs['maxLen']: int
    :type eArgs['top']: int
    :type eArgs['skipTP']: bool (default: False)
    :param eArgs: enumeration parameters
    :param eArgs['type']: kind of itemset to extract
        ('F': frequent, 'C': closed, 'M': maximal)
    :param eArgs['minCnt']: minimum support (count)
    :param eArgs['minSup']: minimum support (probability)
    :param eArgs['maxCnt']: maximum support (count)
    :param eArgs['maxSup']: maximum support (probability)
    :param eArgs['minLen']: minimum number of items in an itemset
    :param eArgs['maxLen']: maximum number of items in an itemset
    :param eArgs['top']: number of top-support patterns to enumerate
    :param eArgs['skipTP']: do not output the patterns (itemsets) that
        match each transaction
    """
    tf = mtemp.Mtemp()
    self.eArgs = eArgs
    self.type = eArgs["type"]
    traSize = float(self.db.traSize)

    # Minimum support and minimum support count.
    minCnt = eArgs.get("minCnt")
    if minCnt is not None:
        self.minCnt = int(minCnt)
        self.minSup = float(self.minCnt) / traSize
    else:
        self.minSup = float(eArgs["minSup"])
        self.minCnt = int(self.minSup * traSize + 0.99)

    # Maximum support and maximum support count.
    self.maxCnt = None
    maxCnt = eArgs.get("maxCnt")
    maxSup = eArgs.get("maxSup")
    if maxCnt is not None:
        self.maxCnt = int(maxCnt)
        self.maxSup = float(self.maxCnt) / traSize
    elif maxSup is not None:
        self.maxSup = float(maxSup)
        self.maxCnt = int(self.maxSup * traSize + 0.99)

    # Base lcm parameters.
    typeSuffix = "If_" if self.msgoff else "If"
    params = {"type": "%s%s" % (self.type, typeSuffix)}
    if self.maxCnt:
        params["U"] = str(self.maxCnt)
    if eArgs.get("minLen") is not None:
        params["l"] = str(eArgs['minLen'])
    if eArgs.get("maxLen") is not None:
        params["u"] = str(eArgs['maxLen'])

    # When an upper limit on the number of patterns is given, run lcm once
    # beforehand to obtain the minimum support count.
    if eArgs.get("top") is not None:
        self.top = eArgs["top"]
    if self.top and self.top > 0:
        import copy
        import re
        xxtop = tf.file()
        top_params = copy.deepcopy(params)
        top_params["i"] = self.file
        top_params["sup"] = "1"
        top_params["K"] = str(self.top)
        top_params["so"] = xxtop
        top_params["type"] = re.sub('_$', '', top_params["type"])
        extTake.lcm(top_params)
        with open(xxtop, "r") as rfile:
            self.minCnt = int(rfile.read().strip())
        if self.minCnt < 0:
            self.minCnt = 1

    self.skipTP = eArgs["skipTP"] if "skipTP" in eArgs else False

    # lcm does not create its output file when there is no frequent
    # pattern, so create an empty one up front.
    lcmout = tf.file()
    with open(lcmout, "w"):
        pass

    # Run lcm.
    params["i"] = self.file
    params["sup"] = str(self.minCnt)
    params["o"] = lcmout
    extTake.lcm(params)

    # Enumerate the single-item sets, needed for the lift value.
    xxone = tf.file()
    tpstr = "FIf_" if self.msgoff else "FIf"
    extTake.lcm(type=tpstr, i=self.file, sup=1, o=xxone, l=1, u=1)

    # Compute the support of each pattern and write it as CSV.
    xxp0 = tf.file()
    self.pFile = self.temp.file()
    items = self.db.items
    trans0 = self.temp.file()
    extTake.lcmtrans(lcmout, "p", trans0)

    pat = None
    pat <<= nm.mdelnull(i=trans0, f="pattern")
    pat <<= nm.mvreplace(vf="pattern", m=items.file, K=items.idFN,
                         f=items.itemFN)
    pat <<= nm.msetstr(v=self.db.traSize, a="total")
    pat <<= nm.mcal(c='${count}/${total}', a="support")
    pat <<= nm.mcut(f="pid,pattern,size,count,total,support")
    pat <<= nm.mvsort(vf="pattern")
    pat <<= nm.msortf(f="pid", o=xxp0)
    pat.run()
    # Example of xxp0:
    #   pid,count,total,support,pattern
    #   0,13,13,1,A
    #   4,6,13,0.4615384615,A B

    xxp1 = tf.file()
    if items.taxonomy is None:
        # No taxonomy specified.
        shutil.move(xxp0, xxp1)
    else:
        # Taxonomy specified: drop rules redundant in terms of taxonomy.
        zdd = VSOP.constant(0)
        for fldVal in nm.mcut(i=xxp0, f='pattern'):
            zdd = zdd + VSOP.itemset(fldVal[0])
        zdd = self.reduceTaxo(zdd, self.db.items)

        xxz1 = tf.file()
        xxz2 = tf.file()
        zdd.csvout(xxz1)

        kept = nm.mcut(nfni=True, f="1:pattern", i=xxz1)
        kept <<= nm.mvsort(vf="pattern")
        kept <<= nm.msortf(f="pattern")

        sel = nm.msortf(f="pattern", i=xxp0)
        sel <<= nm.mcommon(k="pattern", m=kept)
        sel <<= nm.msortf(f="pid", o=xxp1)
        sel.run()

    # Lift computation: per-item counts ...
    transl = tf.file()
    extTake.lcmtrans(xxone, "p", transl)
    xxp2 = nm.mdelnull(i=transl, f="pattern")
    xxp2 <<= nm.mvreplace(vf="pattern", m=items.file, K=items.idFN,
                          f=items.itemFN)
    xxp2 <<= nm.msortf(f="pattern")

    # ... summed as logarithms per pattern.
    xxp3 = nm.mcut(f="pid,pattern", i=xxp1)
    xxp3 <<= nm.mtra(f="pattern", r=True)
    xxp3 <<= nm.mjoin(k="pattern", m=xxp2, f="count:c1")
    xxp3 <<= nm.mcal(c='ln(${c1})', a="c1ln")
    xxp3 <<= nm.msum(k="pid", f="c1ln")
    # Example of xxp3:
    #   pid,pattern,c1,c1ln
    #   0,A,13,2.564949357
    #   1,E,7,1.945910149

    f3 = nm.mjoin(k="pid", f="c1ln", i=xxp1, m=xxp3)
    f3 <<= nm.mcal(
        c='round(exp(ln(${count})-${c1ln}+(${size}-1)*ln(${total})),0.0001)',
        a="lift")
    f3 <<= nm.mcut(f="pid,size,count,total,support,lift,pattern")
    f3 <<= nm.msortf(f="support%nr", o=self.pFile)
    f3.run()

    if self.skipTP:
        return

    # Write out the patterns occurring in each transaction.
    self.tFile = self.temp.file()
    xxw3i = tf.file()
    extTake.lcmtrans(lcmout, "t", xxw3i)

    # Sequential __tid -> original transaction id table.
    xxw1 = nm.mcut(f=self.db.idFN, i=self.db.file)
    xxw1 = xxw1.muniq(k=self.db.idFN)
    xxw1 = xxw1.mnumber(S=0, a="__tid", q=True)
    xxw1 = xxw1.msortf(f="__tid")

    xxw2 = nm.mcut(f="pid", i=self.pFile)

    xxw3 = nm.mcommon(k="pid", i=xxw3i, m=xxw2)
    xxw3 = xxw3.mjoin(k="__tid", m=xxw1, f=self.db.idFN)
    xxw3 = xxw3.mcut(f=self.db.idFN + ",pid", o=self.tFile)
    xxw3.run()
def enumerate(self,eArgs):
    """Enumerate contrast patterns class by class and merge the results.

    For every class in ``self.db.clsNameRecSize`` the class is treated as
    the positive side and all remaining records as the negative side;
    ``lcm_seq`` is run once per class and the per-class results are merged
    into ``self.pFile`` (patterns) and, when ``self.outtf`` is set,
    ``self.tFile`` (transaction id / pattern id pairs).

    Args:
        eArgs: dict of enumeration options.  Keys read here:
            ``minCnt`` or ``minSup`` (one of them is required),
            ``maxCnt`` / ``maxSup``, ``minGR``, ``minProb``, ``uniform``,
            ``minLen``, ``maxLen``, ``gap``, ``win``, ``padding``.

    Side effects:
        Sets ``self.minCnt``, ``self.minSup``, ``self.maxCnt``,
        ``self.maxSup`` (when a maximum is given), ``self.minGR``,
        ``self.minPos``, ``self.sigma[cName]``, ``self.pFile``,
        ``self.tFile`` and ``self.size``.

    Raises:
        KeyError: when neither ``minCnt`` nor ``minSup`` is in ``eArgs``.
    """
    tf=nu.Mtemp()

    # minimum support and minimum support count
    if "minCnt" in eArgs:
        self.minCnt = int(eArgs["minCnt"])
        self.minSup = float(self.minCnt)/ float(self.db.size)
    else:
        self.minSup = float(eArgs["minSup"])
        self.minCnt = int(self.minSup * float(self.db.size) + 0.99)

    # maximum support and maximum support count
    self.maxCnt=None
    if "maxCnt" in eArgs or "maxSup" in eArgs:
        if "maxCnt" in eArgs:
            self.maxCnt = int(eArgs["maxCnt"])
            self.maxSup = float(self.maxCnt)/float(self.db.size)
        else:
            self.maxSup = float(eArgs["maxSup"])
            self.maxCnt = int(self.maxSup * float(self.db.size) + 0.99)

    # Evaluate the flag once so the growth-rate threshold (in the loop) and
    # the posterior probability (after the loop) always agree on it.
    uniform = bool(eArgs["uniform"]) if "uniform" in eArgs else False

    # Result file names of the per-class enumeration (one class as pos,
    # all the other classes as neg).
    pFiles=[]
    tFiles=[]
    for cName,posSize in self.db.clsNameRecSize.items():
        negSize=self.db.size-posSize

        # minimum growth rate
        if "minGR" in eArgs:
            self.minGR=eArgs["minGR"]
        else:
            minProb = eArgs["minProb"] if ( "minProb" in eArgs ) else 0.5
            if uniform:
                self.minGR = (minProb/(1-minProb)) * (self.db.clsSize-1) # equation (4) in the manual
            else:
                self.minGR = (minProb/(1-minProb)) * (float(negSize)/float(posSize)) # equation (4) in the manual

        # Minimum support count within the positive class.  The values are
        # coerced exactly like at the top of this method, so string-valued
        # arguments no longer raise TypeError here.
        if "minCnt" in eArgs:
            self.minPos = int(eArgs["minCnt"])
        else:
            self.minPos = int(float(eArgs["minSup"]) * float(posSize) + 0.99)

        # maximum support count within the positive class
        if "maxCnt" in eArgs or "maxSup" in eArgs:
            if "maxCnt" in eArgs:
                self.maxCnt = int(eArgs["maxCnt"])
            else:
                self.maxCnt = int(float(eArgs["maxSup"]) * float(posSize) + 0.99)

        self.sigma[cName] = self.calSigma(self.minPos,self.minGR,posSize,negSize)

        # lcm_seq output file.  lcm_seq does not create it when no frequent
        # pattern is found, so an empty file is created beforehand.
        lcmout = tf.file()
        with open(lcmout, "w"):
            pass

        # lcm_seq parameters
        params = {}
        params["type"] = "CIA_" if self.msgoff else "CIA"
        if self.maxCnt: # upper bound
            params["U"] = str(self.maxCnt)
        if "minLen" in eArgs:
            params["l"] = str(eArgs["minLen"])
        if 'maxLen' in eArgs:
            params["u"] = str(eArgs["maxLen"])
        if 'gap' in eArgs:
            params["g"] = str(eArgs["gap"])
        if 'win' in eArgs:
            params["G"] = str(eArgs["win"])
        params["w"] = self.weightFile[cName]
        params["i"] = self.file
        params["sup"] = str(self.sigma[cName])
        params["o"] = lcmout

        # run lcm_seq; with padding the variant that does not output
        # item 0 is used
        if 'padding' in eArgs and eArgs["padding"]:
            extTake.lcmseq_zero(params)
        else:
            extTake.lcmseq(params)

        # compute the pattern supports and write them as CSV
        pFiles.append(self.temp.file())
        transle = self.temp.file()
        extTake.lcmtrans(lcmout,"e",transle) # pattern,countP,countN,size,pid
        f=None
        f <<= nm.mdelnull(f="pattern",i=transle)
        f <<= nm.mcal(c='round(${countN},1)',a="neg")
        f <<= nm.mcal(c='round(${countP}/%s,1)'%(self.posWeight[cName]),a="pos")
        f <<= nm.msetstr(v=cName,a="class")
        f <<= nm.msetstr(v=posSize,a="posTotal")
        f <<= nm.msetstr(v=self.minGR,a="minGR")
        f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,minGR",o=pFiles[-1])
        f.run()

        if self.outtf:
            # write the sequences appearing in each transaction
            tFiles.append(self.temp.file())
            xxw= tf.file()
            # id -> __tid map.  The original built this file twice in a row
            # (the first time with a trailing msortf); only the second
            # result survived, so only that pipeline is kept.
            nm.mcut(f=self.db.idFN,i=self.db.file).muniq(k=self.db.idFN).mnumber(S=0,a="__tid",q=True,o=xxw).run()
            translt = self.temp.file()
            extTake.lcmtrans(lcmout,"t",translt)
            nm.mjoin(k="__tid",m=xxw,f=self.db.idFN,i=translt).msetstr(v=cName,a="class").mcut(f=self.db.idFN+",class,pid",o=tFiles[-1]).run()

    # merge the per-class pattern and tid-pid files into the final output
    self.pFile = self.temp.file()
    self.tFile = self.temp.file()

    # concatenate the pattern files and assign a pid unique over all classes
    xxpCat = tf.file()
    f = nm.mcat(i=",".join(pFiles))
    f <<= nm.msortf(f="class,pid")
    f <<= nm.mnumber(s="class,pid",S=0,a="ppid",o=xxpCat)
    f.run()

    # compute the pattern statistics
    items=self.db.items
    f = nm.mcut(f="class,ppid:pid,pattern,size,pos,neg,posTotal,minGR",i=xxpCat)
    f <<= nm.msetstr(v=self.db.size,a="total")
    f <<= nm.mcal(c='${total}-${posTotal}',a="negTotal") # total count of neg
    f <<= nm.mcal(c='${pos}/${posTotal}',a="support") # support
    f <<= nm.mcal(c='if(${neg}==0,1.797693135e+308,(${pos}/${posTotal})/(${neg}/${negTotal}))',a="growthRate")
    if uniform:
        f <<= nm.mcal(c='(${pos}/${posTotal})/(${pos}/${posTotal}+(%s-1)*${neg}/${negTotal})'%(self.db.clsSize),a="postProb")
    else:
        f <<= nm.mcal(c='${pos}/(${pos}+${neg})',a="postProb")
    # selection by minSup and minGR
    # NOTE(review): self.minPos is the value left by the last class of the
    # loop above; when minSup (not minCnt) is given it differs per class.
    # Confirm that filtering every class with it is intended.
    f <<= nm.msel(c='${pos}>=%s&&${growthRate}>=${minGR}'%(self.minPos))
    f <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
    f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,negTotal,total,support,growthRate,postProb")
    f <<= nm.mvsort(vf="pattern")
    f <<= nm.msortf(f="class%nr,postProb%nr,pos%nr",o=self.pFile)
    f.run()

    if self.outtf:
        # master used to keep only transactions containing enumerated patterns
        xxp4=nm.mcut(f="class,pid",i=self.pFile)
        f = nm.mcat(i=",".join(tFiles))
        f <<= nm.mjoin(k="class,pid",m=xxpCat,f="ppid") # attach the pid unified over all classes (ppid)
        f <<= nm.mcommon(k="class,ppid",K="class,pid",m=xxp4) # keep enumerated patterns only
        f <<= nm.mcut(f=self.db.idFN+",class,ppid:pid")
        f <<= nm.msortf(f=self.db.idFN+",class,pid",o=self.tFile)
        f.run()

    self.size = nu.mrecount(i=self.pFile)
def mnetpie(ei, ni, ef, nf, o, nodeSizeFld=None, nodeTipsFld=None, nodeColorFld=None, edgeWidthFld=None, edgeColorFld=None, pieDataFld=None, pieTipsFld=None, picFld=None, undirect=False, offline=False):
    """Write an HTML page that draws a graph with D3 (force layout).

    Nodes are drawn as plain circles, as pie charts (``pieDataFld`` +
    ``pieTipsFld``) or as images (``picFld``).

    Args:
        ei: edge file.
        ni: node file.
        ef: the two edge field names (start, end), as a list or a
            comma-separated string.
        nf: node field name in ``ni``.
        o: output HTML path; ``None`` writes to standard output.
        nodeSizeFld: node size field (a constant "50" is used when omitted).
        nodeTipsFld: node tooltip field.
        nodeColorFld: node colour field ("skyblue" when omitted).
        edgeWidthFld: edge width field ("1" when omitted).
        edgeColorFld: edge colour field ("black" when omitted).
        pieDataFld: pie value field; must come together with ``pieTipsFld``.
        pieTipsFld: pie tooltip field.
        picFld: image URL field; exclusive with the pie fields.
        undirect: when true, no arrow head is drawn on the links.
        offline: when true, the D3 source is embedded into the page instead
            of being referenced by URL.

    Raises:
        Exception: on an invalid combination of the arguments above.
    """
    # ei: edge file / ef: edge fields
    if isinstance(ef, str):
        ef = ef.split(',')
    if len(ef) != 2:
        raise Exception("ef= takes just two field names")

    # pieDataFld and pieTipsFld must be given together
    if (pieDataFld is None) != (pieTipsFld is None):
        raise Exception(
            "pieDataFld= pieTipsFld= are necessary at the same time")

    if picFld is not None and pieDataFld is not None:
        raise Exception(
            "picFld= cannot be specified with pieDataFld= pieTipsFld=")

    if nodeColorFld is not None:
        if picFld is not None or pieDataFld is not None or pieTipsFld is not None:
            raise Exception(
                "nodeColorFld= cannot be specified with pieDataFld= pieTipsFld= picFld="
            )

    # 1: pie chart nodes, 2: picture nodes, 0: plain circles
    if pieDataFld is not None and pieTipsFld is not None:
        caseNo = 1
    elif picFld is not None:
        caseNo = 2
    else:
        caseNo = 0

    tempW = mtemp.Mtemp()

    # MAKE NODE DATA
    xxnode = tempW.file()
    nodefld = []   # fields cut from the node file (renamed)
    nodedmy1 = []  # names of the constant fields to add
    nodedmy2 = []  # values of the constant fields to add
    nodefld.append("%s:node" % (nf))
    if nodeSizeFld is not None:
        nodefld.append("%s:nodesize" % (nodeSizeFld))
    else:
        nodedmy1.append("nodesize")
        nodedmy2.append("50")

    if nodeTipsFld is not None:
        # the original referenced the undefined name `nodeTipFld` here
        nodefld.append("%s:nodeT" % (nodeTipsFld))
    else:
        nodedmy1.append("nodeT")
        nodedmy2.append("")

    if nodeColorFld is not None:
        nodefld.append("%s:nodeClr" % (nodeColorFld))
    else:
        nodedmy1.append("nodeClr")
        nodedmy2.append("skyblue")

    if caseNo == 1:
        nodefld.append("%s:pieD" % (pieDataFld))
        nodefld.append("%s:pieT" % (pieTipsFld))
    elif caseNo == 2:
        nodefld.append("%s:pic" % (picFld))
    else:
        nodedmy1.append("pic")
        nodedmy2.append("")

    f1 = None
    f1 <<= nm.mcut(i=ni, f=nodefld)
    if len(nodedmy1) != 0:
        f1 <<= nm.msetstr(a=nodedmy1, v=nodedmy2)

    if caseNo == 1:
        f1 <<= nm.mshare(k="node", f="pieD:pieDS")
        f1 <<= nm.mnumber(k="node", a="nodeid", B=True)
        f2 = nm.muniq(k="pieT", i=f1)
        f2 <<= nm.mnumber(q=True, a="pieTno")
        f2 <<= nm.mjoin(k="pieT", f="pieTno", i=f1).iredirect("m")
        f2 <<= nm.msortf(f="nodeid%n,pieTno%n", o=xxnode)
    else:
        f2 = nm.mnumber(a="nodeid%n", q=True, i=f1, o=xxnode)

    f2.run()

    # MAKE EDGE DATA
    xxedge = tempW.file()
    edgefld = []
    edgedmy1 = []
    edgedmy2 = []
    edgefld.append("%s:edgeS" % (ef[0]))
    edgefld.append("%s:edgeE" % (ef[1]))
    if edgeWidthFld is not None:
        edgefld.append("%s:edgesize" % (edgeWidthFld))
    else:
        edgedmy1.append("edgesize")
        edgedmy2.append("1")

    if edgeColorFld is not None:
        edgefld.append("%s:edgecolor" % (edgeColorFld))
    else:
        edgedmy1.append("edgecolor")
        edgedmy2.append("black")

    f3 = None
    f3 <<= nm.mcut(i=ei, f=edgefld)
    if len(edgedmy1) != 0:
        f3 <<= nm.msetstr(a=edgedmy1, v=edgedmy2)
    f3 <<= nm.mnumber(a="preNo", q=True)
    f3 <<= nm.mbest(k="edgeS,edgeE", s="preNo%nr")
    f3 <<= nm.mnumber(s="preNo%n", a="edgeID")
    f3 <<= nm.mjoin(k="edgeS", K="node", f="nodeid:edgeSid", m=xxnode)
    f3 <<= nm.mjoin(k="edgeE", K="node", f="nodeid:edgeEid", m=xxnode)

    # flag edges that exist in both directions (just in case)
    f4 = None
    f4 <<= nm.mfsort(i=f3, f="edgeS,edgeE")
    f4 <<= nm.mcount(k="edgeS,edgeE", a="edgecnt")
    f4 <<= nm.mselnum(c="[2,]", f="edgecnt")
    f4 <<= nm.msetstr(a="biflg", v=1)
    f4 <<= nm.mjoin(k="edgeID", f="biflg", n=True, i=f3).iredirect("m")
    f4 <<= nm.msortf(f="edgeID%n", o=xxedge)
    f4.run()

    # Build the graph data as a JSON-like literal for the page.
    # NOTE(review): field values are interpolated without escaping, so a
    # double quote or backslash in the data breaks the generated script.
    gdata = "{\"nodes\":["
    nodedatastk = []
    if caseNo == 1:
        nodedatas = ""
        # top / bot mark the first / last record of each nodeid group
        for val, top, bot in nm.readcsv(xxnode).getline(k="nodeid", otype='dict', q=True):
            name = val["node"]
            r = val["nodesize"]
            title = val["nodeT"]
            if top:
                nodedatas = "{\"name\":\"%s\",\"title\":\"%s\",\"r\":%s,\"node\":[" % (
                    name, title, r)

            pieTno = val["pieTno"]
            pieT = val["pieT"]
            pieDS = val["pieDS"]
            nodedatas += "{\"group\":%s,\"color\":%s,\"value\":%s,\"title\":\"%s\"}" % (
                pieTno, pieDS, pieDS, pieT)

            if bot:
                nodedatas += "]}"
                nodedatastk.append(nodedatas)
                nodedatas = ""
            else:
                nodedatas += ","
    else:
        for val in nm.readcsv(xxnode).getline(otype='dict'):
            name = val["node"]
            r = val["nodesize"]
            title = val["nodeT"]
            pic = val["pic"]
            nclr = val["nodeClr"]
            nodedatas = "{\"name\":\"%s\",\"title\":\"%s\",\"pic\":\"%s\",\"color\":\"%s\",\"r\":%s}" % (
                name, title, pic, nclr, r)
            nodedatastk.append(nodedatas)
    gdata += ",".join(nodedatastk)

    gdata += "],\"links\": ["
    edgedatastk = []
    for val in nm.readcsv(xxedge).getline(otype='dict'):
        es = val["edgeSid"]
        et = val["edgeEid"]
        esize = val["edgesize"]
        ecolor = val["edgecolor"]
        edgedatas = "{\"source\":%s,\"target\":%s,\"length\":500,\"ewidth\":%s,\"color\":\"%s\"}" % (
            es, et, esize, ecolor)
        edgedatastk.append(edgedatas)

    gdata += ','.join(edgedatastk)
    gdata += "]}"

    # arrow heads are drawn for directed graphs only
    direct = ".attr('marker-end','url(#arrowhead)')"
    if undirect:
        direct = ""

    # The node templates are inserted into the page as values of
    # str.format(), so they are NOT format strings themselves: the doubled
    # braces in the pie template reach the page as they are (a nested JS
    # block).
    nodeTemplate = '''
    node
    .append("circle")
    .attr("r",function(d){return d.r/4;})
    .attr("fill", function(d){return d.color;})
    .append("title")
    .text(function(d){return d.title;})
    '''

    nodemakeTemplate = '''
    for(var i=0 ; i< graph.nodes.length;i++){
        graph.nodes[i].id = i
    }
    '''

    if pieDataFld is not None:
        nodeTemplate = '''
    node.selectAll("path")
        .data( function(d, i){ return pie(d.node); })
        .enter()
        .append("svg:path")
        .attr("d", arc)
        .attr("fill", function(d, i) { return color(d.data.group); })
        .append("title")
        .text(function(d){{return d.data.title;}})

    node.append("circle")
        .attr("r",function(d){{return d.r/4;}})
        .attr({ 'fill': 'white' })
        .append("title")
        .text(function(d){{return d.title;}});
    '''

        nodemakeTemplate = '''
    for(var i=0 ; i< graph.nodes.length;i++){
        var r = graph.nodes[i].r
        for(var j=0 ; j< graph.nodes[i].node.length;j++){
            graph.nodes[i].node[j]['r'] = r
        }
        graph.nodes[i].id = i
    }
    '''

    elif picFld is not None:
        nodeTemplate = '''
    node
    .append("image")
    .attr("height",function(d){return d.r;})
    .attr("width",function(d){return d.r;})
    .attr("x",function(d){return -1 * d.r/2; })
    .attr("y",function(d){return -1 * d.r/2; })
    .attr("xlink:href",function(d){return d.pic; })
    .append("title")
    .text(function(d){return d.title;})
    '''

    d3js_str = "<script type='text/javascript' src='http://d3js.org/d3.v3.min.js'></script>"
    if offline:
        # the original emitted "<script>...<script>", leaving the tag open
        d3js_str = "<script>%s</script>" % (vjs.ViewJs.d3jsMin())

    outTemplate = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
{d3js_str}
<style></style>
</head>
<body>
<script type="text/javascript">
    var graph = {gdata} ;

    var width = 4000,
        height = 3000;

    var color = d3.scale.category10();

    {nodemakeTemplate};

    for(var i=0 ; i< graph.links.length;i++){{
        graph.links[i].id = i
    }}

    var pie = d3.layout.pie()
        .sort(null)
        .value(function(d) {{ return d.value; }});

    var arc = d3.svg.arc()
        .outerRadius( function(d){{ return d.data.r ; }})
        .innerRadius( function(d){{ return d.data.r/2 ; }} );

    var svg = d3.select("body").append("svg")
        .attr("width", width)
        .attr("height", height);

    d3.select("svg").append('defs').append('marker')
        .attr({{'id':'arrowhead',
            'viewBox':'-0 -5 10 10',
            'refX':30,
            'refY':0,
            'orient':'auto-start-reverse',
            'markerWidth':5,
            'markerHeight':5,
            'xoverflow':'visible'}})
        .append('path')
        .attr('d', 'M 0,-5 L 10 ,0 L 0,5')
        .attr('fill', '#999')
        .style('stroke','none');

    var g = svg.append("g");
    var node = g.selectAll(".node");
    var link = g.selectAll(".link");
    nodes = graph.nodes
    links = graph.links

    var force = d3.layout.force()
        .linkDistance(200)
        .linkStrength(3.5)
        .charge(-3500)
        .gravity(0.1)
        .friction(0.95)
        .size([width, height])
        .on("tick", function() {{
            link
                .attr("x1", function(d) {{ return d.source.x; }})
                .attr("y1", function(d) {{ return d.source.y; }})
                .attr("x2", function(d) {{ return d.target.x; }})
                .attr("y2", function(d) {{ return d.target.y; }});

            node
                .attr("x", function(d) {{ return d.x; }})
                .attr("y", function(d) {{ return d.y; }})
                .attr("transform", function(d) {{ return "translate(" + d.x + "," + d.y + ")"}});
        }});

    node = node.data(nodes, function( d ) {{ return d.id; }} );
    link = link.data(links, function( d ) {{ return d.id; }} );

    link
        .enter()
        .append("line")
        .attr("class", "link")
        .style("stroke", function( d ) {{ return d.color; }} )
        .style("stroke-width", function( d ) {{ return d.ewidth; }})
        {direct}

    node
        .enter()
        .append("g")
        .attr("class", "node")
        .style({{}})
        .call(force.drag)
        .on("contextmenu", function(nd) {{
            d3.event.preventDefault();
            force.stop()
            nodes.splice( nd.index, 1 );
            links = links.filter(function(nl) {{
                return nl.source.index != nd.index && nl.target.index != nd.index;
            }});
            node = node.data(nodes, function( d ) {{ return d.id; }} );
            node.exit().remove();
            link = link.data( links, function( d ) {{ return d.id; }} );
            link.exit().remove();
            force.nodes(nodes)
                .links(links)
                .start();
        }});

    {nodeTemplate}

    node
        .append("text")
        .attr("text-anchor", "middle")
        .style("stroke", "black")
        .text(function(d) {{ return d.name; }});

    force.nodes(nodes)
        .links(links)
        .start();
</script>
</body>
</html>
    '''.format(d3js_str=d3js_str,
               gdata=gdata,
               nodemakeTemplate=nodemakeTemplate,
               direct=direct,
               nodeTemplate=nodeTemplate)

    if o is None:
        sys.stdout.write(outTemplate)
    else:
        # the context manager closes the file even when the write fails
        with open(o, "w") as html:
            html.write(outTemplate)
def run(self):
    """Enumerate item pairs with sspc and write the edge/node/log files.

    The transaction file ``self.iFile`` is converted to the sspc input
    format, sspc is run, and the resulting pairs are written to
    ``self.oeFile`` together with support, confidence, lift, jaccard and
    PMI.  When ``self.onFile`` is set, per-item frequencies are written to
    it; when ``self.logFile`` is set, the arguments and the elapsed time
    are written to it.

    Raises:
        ValueError: when ``self.sim`` is set to something other than
            "J", "P" or "C".
    """
    from datetime import datetime

    t = datetime.now()

    temp=nu.Mtemp()
    xxsspcin=temp.file()
    xxmap=temp.file()

    # convert the transaction file and build the map file
    if self.num:
        total = self.convN(self.iFile,self.idFN,self.itemFN,xxsspcin,xxmap)
    else:
        total = self.conv(self.iFile,self.idFN,self.itemFN,xxsspcin,xxmap)

    # sample of xxsspcin
    # 3 5 0 2
    # 4 1 2
    # 0 2 3 1
    # 1 0 2
    # 3 4 0 1
    # sample of xxmap
    # ##item,##freq%0nr,##num
    # b,4,0
    # d,4,1
    # f,4,2

    minSupp = int(total*self.minSupPrb) if self.minSupPrb else self.minSupCnt

    # similarity code passed to sspc
    if self.sim:
        simCodes = {"J": "R", "P": "P", "C": "i"}
        if self.sim not in simCodes:
            # the original left sspcSim unbound here and failed later with
            # an UnboundLocalError
            raise ValueError("sim= must be one of J, P or C: %s" % (self.sim,))
        sspcSim = simCodes[self.sim]
    else:
        # without sim= use R with th=0 (no similarity constraint)
        sspcSim="R"
        self.th=0

    ############ enumeration ############
    xxsspcout=temp.file()
    tpstr = sspcSim+"ft_" if self.msgoff else sspcSim+"ft"
    extTake.sspc(type=tpstr,TT=minSupp,i=xxsspcin,th=self.th,o=xxsspcout)
    #####################################

    f = nm.mread(i=xxsspcout)
    f <<= nm.cmd("tr ' ()' ','")
    f <<= nm.mcut(f="1:i1,2:i2,0:frequency,4:sim",nfni=True)

    if self.num:
        f <<= nm.mfldname(f="i1:node1,i2:node2")
        if self.sim!="C":
            f <<= nm.mfsort(f="node1,node2")
        f <<= nm.mjoin(k="node1",K="##item",m=xxmap,f="##freq:frequency1")
        f <<= nm.mjoin(k="node2",K="##item",m=xxmap,f="##freq:frequency2")
    else:
        f <<= nm.mjoin(k="i1",K="##num",m=xxmap,f="##item:node1,##freq:frequency1")
        f <<= nm.mjoin(k="i2",K="##num",m=xxmap,f="##item:node2,##freq:frequency2")
        if self.sim!="C":
            # put the two node names in order and keep each frequency
            # attached to its own node
            f <<= nm.mcut(f="i1,i2,frequency,sim,node1,node2,frequency1,frequency2,node1:node1x,node2:node2x")
            f <<= nm.mfsort(f="node1x,node2x")
            f <<= nm.mcal(c='if($s{node1}==$s{node1x},$s{frequency1},$s{frequency2})',a="freq1")
            f <<= nm.mcal(c='if($s{node2}==$s{node2x},$s{frequency2},$s{frequency1})',a="freq2")
            f <<= nm.mcut(f="i1,i2,frequency,sim,node1x:node1,node2x:node2,freq1:frequency1,freq2:frequency2")

    f <<= nm.msetstr(v=total,a="total")
    f <<= nm.mcal(c='${frequency}/${frequency1}',a="confidence")
    f <<= nm.mcal(c='${frequency}/${total}',a="support")
    f <<= nm.mcal(c='${frequency}/(${frequency1}+${frequency2}-${frequency})',a="jaccard")
    f <<= nm.mcal(c='(${frequency}*${total})/((${frequency1}*${frequency2}))',a="lift")
    f <<= nm.mcal(c='(ln(${frequency})+ln(${total})-ln(${frequency1})-ln(${frequency2}))/(ln(${total})-ln(${frequency}))',a="PMI")
    f <<= nm.mcut(f="node1,node2,frequency,frequency1,frequency2,total,support,confidence,lift,jaccard,PMI")
    f <<= nm.msortf(f="node1,node2",o=self.oeFile)
    f.run()

    if self.onFile:
        f4 = nm.mcut(f=self.itemFN+":node",i=self.iFile)
        f4 <<= nm.mcount(k="node",a="frequency")
        if self.node_support:
            minstr = "[%s,]"%(minSupp)
            f4 <<= nm.mselnum(f="frequency",c=minstr)
        f4 <<= nm.msetstr(v=total,a="total")
        f4 <<= nm.mcal(c='${frequency}/${total}',a="support")
        f4 <<= nm.mcut(f="node,support,frequency,total",o=self.onFile)
        f4.run()

    procTime=datetime.now()-t

    # write the log file
    if self.logFile:
        kv=[["key","value"]]
        for k,v in self.args.items():
            kv.append([k,str(v)])
        kv.append(["time",str(procTime)])
        nm.writecsv(i=kv,o=self.logFile).run()
# Scratch/demo statements exercising nysol.mcmd pipelines with list and
# file outputs.
# NOTE(review): `nm` and `data1` are not defined in this fragment; the
# commented-out lines further down look like their intended definitions --
# confirm they are provided earlier in the file.

# Two branches sharing the same mselstr source: n3 takes its normal output
# (o=xxx), n2 takes the redirect("u") output and refers to n3 through m=.
xxx=[]
n1 = nm.mselstr(i=data1,f="a",v="1")
n3 = n1.msetstr(a="d1",v="x0",o=xxx)
n2 = n1.redirect("u").msetstr(a="d2",v="x1").mproduct(m=n3,f="d1")
# No run() is called on n2/n3 before this print.
# NOTE(review): presumably this prints the still-empty list -- confirm
# whether a run() call is missing.
print(xxx)

#import nysol.mod as nm
#data1 =[["a","b","c"],["1","2","3"],["4","5","6"]]

# mselstr with both o= and u= given as file names, followed by one more
# command; the value returned by run() is printed.
n1 = None
n1 <<= nm.mselstr(i=data1,f="a",v="1",o="t1",u="t2")
n1 <<= nm.msetstr(a="add",v="vvv")
print(n1.run())

# o= given on an intermediate command (msum) as well as on the last one.
n2 = None
n2 <<= nm.mcut(i=data1,f="a,b,c")
n2 <<= nm.msum(f="c",o="t4")
n2 <<= nm.mcut(f="b,c",o="t5")
print(n2.run(msg="on"))

# Same shape as the first n1 pipeline above, with lists instead of file
# names for o= and u=.
xxx1 = []
xxx2 = []
n1 = None
n1 <<= nm.mselstr(i=data1,f="a",v="1",o=xxx1,u=xxx2)
n1 <<= nm.msetstr(a="add",v="vvv")
n1.run()