Ejemplo n.º 1
0
def __mkMap(key, nf, ni, ef1, ef2, ei, oFile):

    # leaf nodeの構築
    infL = [
        nm.mcommon(k=ef1, K=key, m=ei, r=True, i=ei).mcut(f="%s:nam" % (ef1)),
        nm.mcommon(k=ef2, K=key, m=ei, r=True, i=ei).mcut(f="%s:nam" % (ef2))
    ]
    if ni:
        infL.append(
            nm.mcommon(k=nf, K=key, m=ei, r=True,
                       i=ni).mcut(f="%s:nam" % (nf)))

    xleaf = nm.muniq(i=infL, k="nam")
    xleaf <<= nm.msetstr(v=1, a="leaf")

    if ni:
        inp = [
            nm.mcut(f="%s:nam" % (nf), i=ni),
            nm.mcut(f="%s:nam" % (key), i=ni)
        ]

    else:
        inp = [
            nm.mcut(f="%s:nam" % (ef1), i=ei),
            nm.mcut(f="%s:nam" % (ef2), i=ei),
            nm.mcut(f="%s:nam" % (key), i=ei)
        ]

    f = None
    f <<= nm.muniq(k="nam", i=inp)
    f <<= nm.mjoin(k="nam", m=xleaf, f="leaf", n=True)
    # nullは最初に来るはずなので、mcalでなくmnumberでもnullを0に採番できるはずだが念のために
    f <<= nm.mcal(c='if(isnull($s{nam}),0,line()+1)', a="num")
    f <<= nm.mnullto(f="nam", v="##NULL##", o=oFile)
    f.run()
Ejemplo n.º 2
0
def mnest2tree(ei, ef, k, ni=None, nf=None, ev=None, no=None, eo=None):
    # paracheck追加
    efs = ef.split(",")
    ef1 = efs[0]
    ef2 = efs[1]

    f = nm.mcut(f="%s:#orgKey,%s:#orgEf1,%s:#orgEf2" % (k, ef1, ef2), i=ei)

    temp = mtemp.Mtemp()
    of = temp.file()

    with _nu.mcsvout(o=of, f="#orgKey,#orgEf1,#orgEf2,#ef1,#ef2") as oCSV:
        for flds in f:
            orgKey = flds[0]
            orgEf1 = flds[1]
            orgEf2 = flds[2]
            oCSV.write([orgKey, orgEf1, orgEf2, orgKey, orgEf1])
            oCSV.write([orgKey, orgEf1, orgEf2, orgKey, orgEf2])

    f = None
    f <<= nm.mjoin(k="#orgKey,#orgEf1,#orgEf2", K=[k, ef1, ef2], m=ei,
                   i=of)  # 全項目join
    if ev:
        f <<= nm.mavg(k="#ef1,#ef2", f=ev)
    else:
        f <<= nm.muniq(k="#ef1,#ef2")

    f <<= nm.mcut(r=True, f="#orgKey,#orgEf1,#orgEf2")
    f <<= nm.mfldname(f="#ef1:%s,#ef2:%s" % (ef1, ef2), o=eo)
    f.run()

    if ni:
        head = nu.mheader(i=ni)
        fldnames = [s for s in head if s != nf]
        commas = ',' * (len(fldnames) - 1)

        f0 = None
        f0 <<= nm.mcut(f="%s:%s" % (ef1, nf), i=eo)
        f0 <<= nm.muniq(k=nf)
        f0 <<= nm.mcommon(k=nf, m=ni, r=True)
        f0 <<= nm.msetstr(v=commas, a=fldnames)

        f = nm.mcut(f=k, r=True, i=[ni, f0])
        f <<= nm.msetstr(v="", a=k, o=no)
        f.run()
Ejemplo n.º 3
0
    def run(self, **kw_args):

        os.environ["KG_VerboseLevel"] = "2"
        if "msg" in kw_args:
            if kw_args["msg"] == "on":
                os.environ['KG_ScpVerboseLevel'] = "3"

        temp = Mtemp()
        xxedge = temp.file()
        xxnode = temp.file()
        xxnam2num = temp.file()
        xxnum2nam = temp.file()
        xxebase = temp.file()
        xxbody = temp.file()

        e1 = None
        if self.ew:
            e1 <<= nm.mcut(f="%s:__node1,%s:__node2,%s:__weight" %
                           (self.ef1, self.ef2, self.ew),
                           i=self.eFile)
        else:
            e1 <<= nm.mcut(f="%s:__node1,%s:__node2" % (self.ef1, self.ef2),
                           i=self.eFile)

        e1 <<= nm.muniq(k="__node1,__node2")

        e2 = nm.mfldname(i=e1, f="__node2:__node1,__node1:__node2")

        fe = None
        fe <<= nm.muniq(k="__node1,__node2", i=[e1, e2], o=xxedge)
        fe.run()

        # cleaning the node data (remove duplicate nodes)
        fn = None
        if self.nFile:
            if self.nw:
                fn <<= nm.mcut(f="%s:__node,%s" % (self.nf, self.nw),
                               i=self.nFile)
            else:
                fn <<= nm.mcut(f="%s:__node" % (self.nf), i=self.nFile)

            fn <<= nm.muniq(k="__node", o=xxnode)

        else:
            xxen1 = nm.mcut(f="__node1:__node", i=xxedge)
            xxen2 = nm.mcut(f="__node2:__node", i=xxedge)
            fn <<= nm.muniq(k="__node", o=xxnode, i=[xxen1, xxen2])

        fn.run()

        # 節点名<=>節点番号変換表の作成
        fmap = None
        fmap <<= nm.mcut(f="__node", i=xxnode)
        fmap <<= nm.mnumber(a="__num", S=1, q=True, o=xxnam2num)
        fmap <<= nm.msortf(f="__num", o=xxnum2nam)
        fmap.run()

        # 節点ファイルが指定された場合は枝ファイルとの整合性チェック
        if self.nFile:
            ncheck = nm.mcut(f="__node1:__node", i=xxedge)
            ncheck <<= nm.mcommon(k="__node", m=xxnam2num, r=True)
            nmatch = ncheck.run()
            if len(nmatch) > 0:
                raise Exception(
                    "#ERROR# the node named '%s' in the edge file doesn't exist in the node file."
                    % (nmatch[0][0]))

        # metisのグラフファイルフォーマット
        # 先頭行n m [fmt] [ncon]
        # n: 節点数、m:枝数、ncon: 節点weightの数
        # 1xx: 節点サイズ有り (not used, meaning always "0")
        # x1x: 節点weight有り
        # xx1: 枝がweightを有り
        # s w_1 w_2 ... w_ncon v_1 e_1 v_2 e_2 ... v_k e_k
        # s: 節点サイズ  (節点サイズは利用不可)
        # w_x: 節点weight
        # v_x: 接続のある節点番号(行番号)
        # e_x: 枝weight

        # --------------------
        # generate edge data using the integer numbered nodes
        #fnnum = None
        fnnum = nm.mcut(f="__num:__node_n1", i=xxnam2num)  # {xxnnum}

        fenum = None
        fenum <<= nm.mjoin(k="__node1",
                           K="__node",
                           f="__num:__node_n1",
                           m=xxnam2num,
                           i=xxedge)
        fenum <<= nm.mjoin(k="__node2",
                           K="__node",
                           f="__num:__node_n2",
                           m=xxnam2num)
        fenum <<= nm.msortf(f="__node_n1")  #{xxenum}

        febase = None
        febase <<= nm.mnjoin(k="__node_n1", m=fenum, i=fnnum, n=True)
        febase <<= nm.msortf(f="__node_n1%n,__node_n2%n",
                             o=xxebase)  #{xxebase}"
        febase.run()

        fbody = None
        if not self.ew:
            fbody <<= nm.mcut(f="__node_n1,__node_n2", i=xxebase)
            fbody <<= nm.mtra(k="__node_n1", f="__node_n2", q=True)
            fbody <<= nm.mcut(f="__node_n2", nfno=True, o=xxbody)

        # if ew= is specified, merge the weight data into the edge data.
        else:
            febody = None
            febody <<= nm.mcut(f="__node_n1,__node_n2:__v", i=xxebase)
            febody <<= nm.mnumber(S=0, I=2, a="__seq", q=True)

            fwbody = None
            fwbody <<= nm.mcut(f="__node_n1,__weight:__v", i=xxebase)
            fwbody <<= nm.mnumber(S=1, I=2, a="__seq", q=True)

            fbody <<= nm.msortf(f="__seq%n", i=[febody, fwbody])
            fbody <<= nm.mtra(k="__node_n1", f="__v", q=True)
            fbody <<= nm.mcut(f="__v", nfno=True, o=xxbody)

        fbody.run()
        # xxbody
        # 2 7 3 8 5 9
        # 1 7 3 10 5 11 7 12
        # 1 8 2 10 4 13 7 14

        # --------------------
        # generate node data using integer number
        if self.nFile and self.nw:
            # xxnode
            # __node,v1,v2
            # a,1,1
            # b,1,1
            # c,1,1
            xxnbody = temp.file()
            xxnbody1 = temp.file()
            fnbody = None
            fnbody <<= nm.mjoin(k="__node", f="__num", i=xxnode, m=xxnam2num)
            fnbody <<= nm.msortf(f="__num%n")
            fnbody <<= nm.mcut(f=self.nw, nfno=True)
            fnbody <<= nm.cmd("tr ',' ' ' ")  # tricky!!
            fnbody <<= nm.mwrite(o=xxnbody)
            fnbody.run()
            # xxnbody
            # 1 1
            # 1 1
            # 1 1
            # paste the node weight with edge body
            fnbody1 = None
            fnbody1 <<= nm.mpaste(nfn=True, m=xxbody, i=xxnbody)
            fnbody1 <<= nm.cmd("tr ',' ' ' ")
            fnbody1 <<= nm.mwrite(o=xxnbody1)
            fnbody1.run()
            os.system("mv %s %s" % (xxnbody1, xxbody))

        # xxbody
        # 1 1 2 7 3 8 5 9
        # 1 1 1 7 3 10 5 11 7 12
        # 1 1 1 8 2 10 4 13 7 14

        eSize = mrecount(i=xxedge)
        eSize /= 2
        nSize = mrecount(i=xxnode)
        nwFlag = 1 if self.nw else 0
        ewFlag = 1 if self.ew else 0

        fmt = "0%d%d" % (nwFlag, ewFlag)

        xxhead = temp.file()
        xxgraph = temp.file()

        os.system("echo '%d %d %s %d' > %s" %
                  (nSize, eSize, fmt, self.ncon, xxhead))
        os.system("cat  %s %s > %s" % (xxhead, xxbody, xxgraph))

        if self.mFile:
            nm.mfldname(f="__num:num,__node:node", i=xxnum2nam,
                        o=self.mFile).run()

        if self.dFile:
            os.system("cp %s %s" % (xxgraph, self.dFile))

        if not self.noexe:
            if self.verbose:
                os.system(
                    "gpmetis -seed=%d -ptype=%s -ncuts=%d -ufactor=%d %s %d" %
                    (self.seed, self.ptype, self.ncuts, self.ufactor, xxgraph,
                     self.kway))
            else:
                os.system(
                    "gpmetis -seed=%d -ptype=%s -ncuts=%d -ufactor=%d %s %d  > /dev/null"
                    % (self.seed, self.ptype, self.ncuts, self.ufactor,
                       xxgraph, self.kway))
            import glob
            if len(glob.glob(xxgraph + ".part.*")) == 0:
                raise Exception(
                    "#ERROR# command `gpmetis' didn't output any results")

            # 節点名を数字から元に戻す
            # #{xxgraph}.part.#{kway}
            # 1
            # 0
            # 1
            fo = None
            fo <<= nm.mcut(f="0:cluster",
                           nfni=True,
                           i=xxgraph + ".part." + str(self.kway))
            fo <<= nm.mnumber(S=1, a="__num", q=True)
            fo <<= nm.mjoin(k="__num", f="__node", m=xxnum2nam)
            fo <<= nm.msortf(f="__node,cluster")
            if self.nf:
                fo <<= nm.mcut(f="__node:%s,cluster" % (self.nf), o=self.oFile)
            else:
                fo <<= nm.mcut(f="__node:node,cluster", o=self.oFile)
            fo.run()

        nu.mmsg.endLog(self.__cmdline())
Ejemplo n.º 4
0
tra <<= nm.mcut(f="id,date2:date,ret")

# frequency of one item
freq = None
freq <<= nm.mcut(f="id", i=tra)
freq <<= nm.mcount(k="id", a="freq")
freq <<= nm.mselnum(f="freq", c="[5,]")

# total number of transactions
total = None
total <<= nm.mcut(f="date", i=tra)
total <<= nm.muniq(k="date")
total <<= nm.mcount(a="total")

# frequency of cooccurence of id, and calculate lift values
itemCoFreq = None
itemCoFreq <<= nm.mcut(f="date,id", i=tra)
itemCoFreq <<= nm.mcommon(k="id", m=freq)
itemCoFreq <<= nm.mcombi(k="date", n=2, f="id", a="id1,id2")
itemCoFreq <<= nm.mcut(f="id1,id2")
itemCoFreq <<= nm.mfsort(f="id1,id2")
itemCoFreq <<= nm.mcount(k="id1,id2", a="coFreq")
itemCoFreq <<= nm.mjoin(k="id1", m=freq, K="id", f="freq:freq1")
itemCoFreq <<= nm.mjoin(k="id2", m=freq, K="id", f="freq:freq2")
itemCoFreq <<= nm.mproduct(m=total, f="total")
itemCoFreq <<= nm.mcal(c="(${coFreq}*${total})/(${freq1}*${freq2})", a="lift")
itemCoFreq <<= nm.msel(c="${lift}>10 && ${coFreq}>6")
r = itemCoFreq.run(msg=debug)
print(r)
print(len(r))
Ejemplo n.º 5
0
	def enumerate(self,eArgs):
		tf=nu.Mtemp()

		# 最小サポートと最小サポート件数
		if "minCnt" in eArgs :
			self.minCnt = int(eArgs["minCnt"])
			self.minSup = float(self.minCnt)/ float(self.db.size)
		else:
			self.minSup = float(eArgs["minSup"])
			self.minCnt = int(self.minSup * float(self.db.size) + 0.99)


		# 最大サポートと最大サポート件数
		self.maxCnt=None
		if "maxCnt" in eArgs or "maxSup" in eArgs:
			if "maxCnt" in eArgs:
				self.maxCnt = int(eArgs["maxCnt"])
				self.maxSup = float(self.maxCnt)/float(self.db.size)
			else:
				self.maxSup = float(eArgs["maxSup"])
				self.maxCnt = int(self.maxSup * float(self.db.size) + 0.99)

		#未使用
		#@minProb = eArgs["minProb"].to_f # 事後確率
		#@minGR   = @minProb/(1-@minProb) # 増加率
		#@minGR   = eArgs["minGR"].to_f if eArgs["minGR"]

		# あるクラスをpos、他のクラスをnegにして、パターン列挙した結果ファイル名を格納する
		pFiles=[]
		tFiles=[]
		for cName,posSize in self.db.clsNameRecSize.items(): 
			negSize=self.db.size-posSize

			# minGRの計算
			if "minGR" in eArgs:
				self.minGR=eArgs["minGR"]
			else:
				minProb = eArgs["minProb"] if ( "minProb" in eArgs ) else 0.5
				if "uniform" in eArgs and eArgs["uniform"]:
					self.minGR = (minProb/(1-minProb)) * (self.db.clsSize-1) # マニュアルの式(4)
				else:
					self.minGR = (minProb/(1-minProb)) * (float(negSize)/float(posSize)) # マニュアルの式(4)


			# 最小サポートと最小サポート件数
			if "minCnt" in eArgs:
				self.minPos = eArgs["minCnt"]
			else:
				self.minPos = int(eArgs["minSup"] * float(posSize) + 0.99)

			# 最大サポートと最大サポート件数
			if "maxCnt" in eArgs or "maxSup" in eArgs:
				if "maxCnt" in eArgs:
					self.maxCnt = int(eArgs["maxCnt"])
				else:
 					self.maxCnt = int(eArgs["maxSup"] * float(posSize) + 0.99)


			self.sigma[cName] = self.calSigma(self.minPos,self.minGR,posSize,negSize)

			# lcm_seqのパラメータ設定と実行
			lcmout = tf.file() # lcm_seq出力ファイル
			# 頻出パターンがなかった場合、lcm出力ファイルが生成されないので
			# そのときのために空ファイルを生成しておいく。
			with open(lcmout, "w") as efile:
				pass

			params = {}
			if self.msgoff:
				params["type"] ="CIA_"
			else:
				params["type"] ="CIA"

			if self.maxCnt: # windowサイズ上限
				params["U"] = str(self.maxCnt)
			if "minLen" in eArgs:
				params["l"] = str(eArgs["minLen"])
			if 'maxLen' in eArgs:
				params["u"] = str(eArgs["maxLen"])
			if 'gap' in eArgs:
				params["g"] = str(eArgs["gap"])
			if 'win' in eArgs:
				params["G"] = str(eArgs["win"])

			params["w"] = self.weightFile[cName]
			params["i"] = self.file
			params["sup"] = str(self.sigma[cName])
			params["o"] = lcmout

			# lcm_seq実行
			#MCMD::msgLog("#{run}")
			if 'padding' in eArgs and eArgs["padding"]: # padding指定時は、0アイテムを出力しないlcm_seqを実行
				extTake.lcmseq_zero(params)
			else:
				extTake.lcmseq(params)

			# パターンのサポートを計算しCSV出力する
			#MCMD::msgLog("output patterns to CSV file ...")
			pFiles.append(self.temp.file())
			transle = self.temp.file()

			extTake.lcmtrans(lcmout,"e",transle) # pattern,countP,countN,size,pid

			f=None
			f <<= nm.mdelnull(f="pattern",i=transle)
			f <<= nm.mcal(c='round(${countN},1)',a="neg")
			f <<= nm.mcal(c='round(${countP}/%s,1)'%(self.posWeight[cName]),a="pos")
			f <<= nm.mdelnull(f="pattern")
			f <<= nm.msetstr(v=cName,a="class")
			f <<= nm.msetstr(v=posSize,a="posTotal")
			f <<= nm.msetstr(v=self.minGR,a="minGR")
			f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,minGR",o=pFiles[-1])
			f.run()

			#s = MCMD::mrecount("i=#{pFiles.last}") # 列挙されたパターンの数
			#MCMD::msgLog("the number of contrast patterns on class `#{cName}' enumerated is #{s}")

			if self.outtf :
				# トランザクション毎に出現するシーケンスを書き出す
				#MCMD::msgLog("output tid-patterns ...")
				tFiles.append(self.temp.file())

				xxw= tf.file()
				f=None
				f <<= nm.mcut(f=self.db.idFN,i=self.db.file)
				f <<= nm.muniq(k=self.db.idFN)
				f <<= nm.mnumber(S=0,a="__tid",q=True)
				f <<= nm.msortf(f="__tid",o=xxw)
				f.run()

				nm.mcut(f=self.db.idFN,i=self.db.file).muniq(k=self.db.idFN).mnumber(S=0,a="__tid",q=True,o=xxw).run()
				translt = self.temp.file()
				extTake.lcmtrans(lcmout,"t",translt)
				nm.mjoin(k="__tid",m=xxw,f=self.db.idFN,i=translt).msetstr(v=cName,a="class").mcut(f=self.db.idFN+",class,pid",o=tFiles[-1]).run()


		# クラス別のパターンとtid-pidファイルを統合して最終出力
		self.pFile = self.temp.file()
		self.tFile = self.temp.file()

		# パターンファイル併合
		xxpCat = tf.file()
		f =   nm.mcat(i=",".join(pFiles))
		f <<= nm.msortf(f="class,pid")
		f <<= nm.mnumber(s="class,pid",S=0,a="ppid",o=xxpCat)
		f.run()

		# パターンファイル計算
		items=self.db.items
		f=""
		f =   nm.mcut(f="class,ppid:pid,pattern,size,pos,neg,posTotal,minGR",i=xxpCat)
		f <<= nm.msetstr(v=self.db.size,a="total")
		f <<= nm.mcal(c='${total}-${posTotal}',a="negTotal") # negのトータル件数
		f <<= nm.mcal(c='${pos}/${posTotal}',a="support") # サポートの計算
		f <<= nm.mcal(c='if(${neg}==0,1.797693135e+308,(${pos}/${posTotal})/(${neg}/${negTotal}))',a="growthRate")
		if "uniform" in eArgs and eArgs["uniform"] == True:
			f <<= nm.mcal(c='(${pos}/${posTotal})/(${pos}/${posTotal}+(%s-1)*${neg}/${negTotal})'%(self.db.clsSize),a="postProb")
		else:
			f <<= nm.mcal(c='${pos}/(${pos}+${neg})',a="postProb")

		f <<= nm.msel(c='${pos}>=%s&&${growthRate}>=${minGR}'%(self.minPos)) # minSupとminGRによる選択
		f <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
		f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,negTotal,total,support,growthRate,postProb")
		f <<= nm.mvsort(vf="pattern")
		f <<= nm.msortf(f="class%nr,postProb%nr,pos%nr",o=self.pFile)
		f.run()

		if self.outtf :
			# 列挙されたパターンを含むtraのみ選択するためのマスタ
			xxp4=nm.mcut(f="class,pid",i=self.pFile)

			f =   nm.mcat(i=",".join(tFiles))
			f <<= nm.mjoin(k="class,pid",m=xxpCat,f="ppid") # 全クラス統一pid(ppid)結合
			f <<= nm.mcommon(k="class,ppid",K="class,pid",m=xxp4) # 列挙されたパターンの選択
			f <<= nm.mcut(f=self.db.idFN+",class,ppid:pid")
			f <<= nm.msortf(f=self.db.idFN+",class,pid",o=self.tFile)
			f.run()


		self.size = nu.mrecount(i=self.pFile)
Ejemplo n.º 6
0
    def run(self):

        from datetime import datetime
        t = datetime.now()

        wf = nu.Mtemp()
        xxinp = wf.file()
        xxmap = wf.file()
        input = self.ei

        self.g2pair(self.ni, self.nf, self.ei, self.ef1, self.ef2, xxinp,
                    xxmap)

        xxmace = wf.file()  # maceの出力(tra形式)

        para = {}
        if self.msgoff:
            para["type"] = "Ce_" if self.all else "Me_"
        else:
            para["type"] = "Ce" if self.all else "Me"
        para["i"] = xxinp
        para["o"] = xxmace
        if self.minSize:
            para["l"] = self.minSize
        if self.maxSize:
            para["u"] = self.maxSize
        extTake.mace(para)

        #MCMD::msgLog("converting the numbered nodes into original name ...")
        id = nu.mrecount(i=xxmace, nfni=True)

        # xxpair = wf.file() # 上記traをpair形式に変換したデータ

        fpair = None
        fpair <<= nm.mcut(i=xxmace, nfni=True, f="0:num")
        fpair <<= nm.mnumber(q=True, a="id")
        fpair <<= nm.mvcount(vf="num:size")
        fpair <<= nm.mtra(r=True, f="num")

        # when ni= specified, it add the isolated single cliques.
        if self.ni:

            fpair_u = nm.mread(i=fpair)

            if self.all:
                fpair_u <<= nm.mselstr(f="size", v=1)
            fpair_u <<= nm.mcut(f="num")
            fpair_u <<= nm.muniq(k="num")

            # select all nodes which are not included in any cliques
            xxiso = None
            xxiso <<= nm.mcut(f="num", i=xxmap)
            xxiso <<= nm.mcommon(k="num", m=fpair_u, r=True)
            xxiso <<= nm.mnumber(S=id, a="id", q=True)
            xxiso <<= nm.msetstr(v=1, a="size")
            xxiso <<= nm.mcut(f="id,num,size")
            # mcut入れないとおかしくなるあとで直す
            #ddlist = [fpair.mcut(f="id,num,size"),xxiso]
            xxpair = nm.mcut(i=[fpair, xxiso], f="id,num,size")

        else:

            xxpair = fpair

        xxpair <<= nm.mjoin(m=xxmap, k="num", f="node")
        xxpair <<= nm.mcut(f="id,node,size")
        xxpair <<= nm.msortf(f="id,node", o=self.oFile)
        xxpair.run()

        procTime = datetime.now() - t

        # ログファイル出力
        if self.logFile:
            kv = [["key", "value"]]
            for k, v in self.args.items():
                kv.append([k, str(v)])
            kv.append(["time", str(procTime)])
            nm.writecsv(i=kv, o=self.logFile).run()
Ejemplo n.º 7
0
    def enumerate(self, eArgs):

        pFiles = []
        tFiles = []
        tf = mtemp.Mtemp()
        for cName, posSize in self.db.clsNameRecSize.items():
            negSize = self.db.traSize - posSize
            if "minGR" in eArgs:
                self.minGR = eArgs["minGR"]
            else:
                minProb = eArgs["minProb"] if ("minProb" in eArgs) else 0.5
                if "uniform" in eArgs and eArgs["uniform"] == True:
                    self.minGR = (minProb / (1 - minProb)) * (
                        self.db.clsSize - 1)  # マニュアルの式(4)
                else:
                    self.minGR = (minProb / (1 - minProb)) * (
                        float(negSize) / float(posSize))  # マニュアルの式(4)

            # 最小サポートと最小サポート件数
            # s=0.05
            # s=c1:0.05,c2:0.06
            # S=10
            # S=c1:10,c2:15
            if "minCnt" in eArgs:
                if isinstance(eArgs["minCnt"], dict):
                    self.minPos = eArgs["minCnt"][cName]
                else:
                    self.minPos = eArgs["minCnt"]
            else:
                if isinstance(eArgs["minSup"], dict):
                    self.minPos = int(eArgs["minSup"][cName] * float(posSize) +
                                      0.99)
                else:
                    self.minPos = int(eArgs["minSup"] * flost(posSize) + 0.99)

            # 最大サポートと最大サポート件数
            if "maxCnt" in eArgs:
                if isinstance(eArgs["maxCnt"], dict):
                    self.maxPos = eArgs["maxCnt"][cName]
                else:
                    self.maxPos = eArgs["maxCnt"]

            elif "maxSup" in eArgs:
                if isinstance(eArgs["maxSup"], dict):
                    self.maxPos = int(eArgs["maxSup"][cName] * float(posSize) +
                                      0.99)
                else:
                    self.maxPos = int(eArgs["maxSup"] * float(posSize) + 0.99)
            else:
                self.maxPos = None

            self.sigma[cName] = self.calSigma(self.minPos, self.minGR, posSize,
                                              negSize)

            # lcmのパラメータ設定と実行
            # 頻出パターンがなかった場合、lcm出力ファイルが生成されないので
            # そのときのために空ファイルを生成しておいく。
            lcmout = tf.file()  # lcm出力ファイル
            with open(lcmout, "w") as efile:
                pass

            runPara = {}

            if self.msgoff:
                runPara["type"] = eArgs["type"] + "IA_"
            else:
                runPara["type"] = eArgs["type"] + "IA"

            #if self.maxPos: #rubyだとif @maxCntなってる(どこにも設定されてないので)動いてないはず
            if self.maxPos:
                runPara["U"] = self.maxPos

            if "minLen" in eArgs:
                runPara["l"] = str(eArgs["minLen"])

            if "maxLen" in eArgs:
                runPara["u"] = str(eArgs["maxLen"])

            runPara["w"] = self.weightFile[cName]

            runPara["i"] = self.file

            runPara["sup"] = str(self.sigma[cName])

            runPara["o"] = lcmout

            # lcm実行
            #MCMD::msgLog("#{run}")
            #TAKE::run_lcm(run)
            #print(self.sigma)
            #print(runPara)
            #MCMD::msgLog("output patterns to CSV file ...")

            extTake.lcm(runPara)

            pFiles.append(self.temp.file())

            transle = tf.file()
            extTake.lcmtrans(lcmout, "e", transle)

            f = nm.mdelnull(f="pattern", i=transle)
            f <<= nm.mcal(c='round(${countN},1)', a="neg")
            f <<= nm.mcal(c='round(${countP}/%s,1)' % (self.posWeight[cName]),
                          a="pos")
            f <<= nm.mdelnull(f="pattern")  #いる?
            f <<= nm.msetstr(v=cName, a="class")
            f <<= nm.msetstr(v=posSize, a="posTotal")
            f <<= nm.msetstr(v=self.minGR, a="minGR")
            f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,minGR",
                          o=pFiles[-1])
            f.run()

            #s = nutil.mrecount(i=self.file)
            #MCMD::msgLog("the number of contrast patterns on class `#{cName}' enumerated is #{s}")

            if self.outtf:
                # トランザクション毎に出現するパターンを書き出す
                #MCMD::msgLog("output tid-patterns ...")
                tFiles.append(self.temp.file())
                xxw = tf.file()

                xxw = nm.mcut(f=self.db.idFN, i=self.db.file)
                xxw <<= nm.muniq(k=self.db.idFN)
                xxw <<= nm.mnumber(S=0, a="__tid", q=True)

                translt = self.temp.file()
                extTake.lcmtrans(lcmout, "t", translt)

                f = nm.mjoin(k="__tid", m=xxw, f=self.db.idFN, i=translt)
                f <<= nm.msetstr(v=cName, a="class")
                f <<= nm.mcut(f=self.db.idFN + ",class,pid", o=tFiles[-1])
                f.run()

        # クラス別のパターンとtid-pidファイルを統合して最終出力
        self.pFile = self.temp.file()
        self.tFile = self.temp.file()

        # パターンファイル併合
        xxpCat = tf.file()
        f = nm.mcat(i=",".join(pFiles))
        f <<= nm.msortf(f="class,pid")
        f <<= nm.mnumber(s="class,pid", S=0, a="ppid", o=xxpCat)
        f.run()

        # パターンファイル計算
        items = self.db.items
        f = nm.mcut(f="class,ppid:pid,pattern,size,pos,neg,posTotal,minGR",
                    i=xxpCat)
        f <<= nm.msetstr(v=self.db.traSize, a="total")
        f <<= nm.mcal(c='${total}-${posTotal}', a="negTotal")  # negのトータル件数
        f <<= nm.mcal(c='${pos}/${posTotal}', a="support")  # サポートの計算
        f <<= nm.mcal(
            c=
            'if(${neg}==0,1.797693135e+308,(${pos}/${posTotal})/(${neg}/${negTotal}))',
            a="growthRate")

        if "uniform" in eArgs and eArgs["uniform"] == True:
            f <<= nm.mcal(
                c='(${pos}/${posTotal})/(${pos}/${posTotal}+(%s-1)*${neg}/${negTotal})'
                % (self.db.clsSize),
                a="postProb")
        else:
            f <<= nm.mcal(c='${pos}/(${pos}+${neg})', a="postProb")

        f <<= nm.msel(c='${pos}>=%s&&${growthRate}>=${minGR}' %
                      (self.minPos))  # minSupとminGRによる選択
        f <<= nm.mvreplace(vf="pattern",
                           m=items.file,
                           K=items.idFN,
                           f=items.itemFN)
        f <<= nm.mcut(
            f="class,pid,pattern,size,pos,neg,posTotal,negTotal,total,support,growthRate,postProb"
        )
        f <<= nm.mvsort(vf="pattern")
        f <<= nm.msortf(f="class%nr,postProb%nr,pos%nr", o=self.pFile)
        f.run()

        # アイテムを包含している冗長なタクソノミを削除
        if items.taxonomy:
            #MCMD::msgLog("reducing redundant rules in terms of taxonomy ...")
            ##ここは後で
            zdd = VSOP.constant(0)
            dt = nm.mcut(i=self.pFile, f="pattern")

            for fldVal in dt:
                zdd = zdd + VSOP.itemset(fldVal[0])

            zdd = self.reduceTaxo(zdd, self.db.items)

            xxp1 = tf.file()
            xxp2 = tf.file()
            xxp3 = tf.file()
            zdd.csvout(xxp1)

            nm.mcut(nfni=True, f="1:pattern",
                    i=xxp1).mvsort(vf="pattern").msortf(f="pattern",
                                                        o=xxp2).run()
            nm.msortf(f="pattern", i=self.pFile).mcommon(
                k="pattern", m=xxp2).msortf(f="class%nr,postProb%nr,pos%nr",
                                            o=xxp3).run()
            shutil.move(xxp3, self.pFile)

        if self.outtf:
            # 列挙されたパターンを含むtraのみ選択するためのマスタ
            xxp4 = nm.mcut(f="class,pid", i=self.pFile)
            f = nm.mcat(i=",".join(tFiles))
            f <<= nm.mjoin(k="class,pid", m=xxpCat,
                           f="ppid")  # 全クラス統一pid(ppid)結合
            f <<= nm.mcommon(k="class,ppid", K="class,pid",
                             m=xxp4)  # 列挙されたパターンの選択
            f <<= nm.mcut(f=self.db.idFN + ",class,ppid:pid")
            f <<= nm.msortf(f=self.db.idFN + ",class,pid", o=self.tFile)
            f.run()
Ejemplo n.º 8
0
    def run(self):
        temp = mtemp.Mtemp()

        ### mtra2gc
        xxsimgN = temp.file()
        xxsimgE = temp.file()
        xxsimgE0 = temp.file()

        param = {}
        param["i"] = self.iFile
        if self.idFN:
            param["tid"] = self.idFN
        if self.itemFN:
            param["item"] = self.itemFN
        if self.sp1:
            param["s"] = self.sp1
        if self.sp2:
            param["S"] = self.sp2

        #####################
        # 異なる向きのconfidenceを列挙するためにsim=C th=0として双方向列挙しておく
        # 出力データは倍になるが、mfriendsで-directedとすることで元が取れている
        param["sim"] = "C"
        param["th"] = "0"

        param["node_support"] = True
        if self.numtp:
            param["num"] = True
        param["no"] = xxsimgN
        param["eo"] = xxsimgE0

        nt.mtra2gc(**param).run()

        f = nm.readcsv(xxsimgE0)
        for i in range(self.filterSize):
            f <<= nm.mselnum(f=self.filter[i],
                             c="[%s,%s]" % (self.lb[i], self.ub[i]))
        f <<= nm.writecsv(xxsimgE)
        f.run()

        ### mfrirends
        xxfriends = temp.file()
        xxfriendE = temp.file()
        xxw = temp.file()
        xxf = temp.file()
        xxff = temp.file()
        xxor = temp.file()

        if not os.path.isdir(xxfriends):
            os.makedirs(xxfriends)
        col = [["FF000080", "FF888880"], ["0000FF80", "8888FF80"],
               ["00FF0080", "88FF8880"]]

        for i in range(len(self.sim)):
            paramf = {}
            paramf["ei"] = xxsimgE
            paramf["ni"] = xxsimgN
            paramf["ef"] = "node1,node2"
            paramf["nf"] = "node"
            paramf["eo"] = xxfriendE
            paramf["no"] = xxfriends + "/n_" + str(i)
            paramf["sim"] = self.sim[i]
            paramf["dir"] = self.dir[i]
            paramf["rank"] = self.rank[i]
            paramf["directed"] = True

            nt.mfriends(**paramf).run()

            frec2 = nm.mfsort(f="node1,node2", i=xxfriendE)
            frec2 <<= nm.msummary(k="node1,node2",
                                  f=self.sim[i],
                                  c="count,mean")
            frec2 <<= nm.mselstr(f="count", v=2)
            # node1%0,node2%1,fld,count,mean
            # a,b,support,2,0.1818181818
            # a,d,support,2,0.1818181818

            f = nm.mjoin(k="node1,node2",
                         K="node1,node2",
                         m=frec2,
                         f="mean:s1",
                         n=True,
                         i=xxfriendE)
            f <<= nm.mjoin(k="node2,node1",
                           K="node1,node2",
                           m=frec2,
                           f="mean:s2",
                           n=True)
            # 1) xxrecs2でsimをjoinできない(s1,s2共にnull)ということは、それは片方向枝なので"F"をつける
            # 2) 双方向枝a->b,b->aのうちa->bのみ(s1がnullでない)に"W"の印をつける。
            # 3) それ以外の枝は"D"として削除
            f <<= nm.mcal(
                c='if(isnull($s{s1}),if(isnull($s{s2}),\"F\",\"D\"),\"W\")',
                a="dir")
            f <<= nm.mselstr(f="dir", v="D", r=True)
            f <<= nm.mcal(c='if($s{dir}==\"W\",$s{s1},$s{%s})' % (self.sim[i]),
                          a="sim")
            f <<= nm.mchgstr(f="dir:color",
                             c='W:%s,F:%s' % (col[i][0], col[i][1]),
                             A=True)
            f <<= nm.msetstr(v=[self.sim[i], str(i)], a="simType,simPriority")
            f <<= nm.mcut(f="simType,simPriority,node1,node2,sim,dir,color",
                          o=xxfriends + "/e_" + str(i))
            f.run()
            # node1%1,node2%0,simType,sim,dir,color
            # b,a,jaccard,0.3333333333,F,8888FF
            # j,c,jaccard,0.3333333333,F,8888FF
            # b,d,jaccard,0.3333333333,F,8888FF
            # a,e,jaccard,0.5,W,0000FF
            # d,e,jaccard,0.5,W,0000FF

        # rule fileの出力
        if self.orFile:
            mmm = nm.mcat(i=xxfriends + "/e_*").muniq(k="node1,node2")
            nm.mcommon(k="node1,node2", i=xxsimgE, m=mmm, o=self.orFile).run()

        # マルチ枝の単一化(W優先,パラメータ位置優先)
        if self.prune:
            """
			# 双方向と片方向に分割
			nm.mcat(i=xxfriends+"/e_*").mselstr(f="dir",v="W",o=xxw,u=xxf).run()
			# 片方向のみの枝を選択
			f =   nm.mcommon(k="node1,node2",K="node1,node2",r=True,m=xxw,i=xxf)
			f <<= nm.mcommon(k="node1,node2",K="node2,node1",r=True,m=xxw,o=xxff)
			f.run()
			f = nm.mcat(i=xxw+","+xxff).mbest(k="node1,node2",s="dir%r,simPriority%n",o=self.oeFile).run()
			"""
            #これだめ
            fo = nm.mcat(i=xxfriends + "/e_*").mselstr(f="dir", v="W")
            fu = fo.direction("u")  # これは再考
            fu <<= nm.mcommon(k="node1,node2", K="node1,node2", r=True, m=fo)
            fu <<= nm.mcommon(k="node1,node2", K="node2,node1", r=True, m=fo)
            #f  =   nm.m2cat()
            f = nm.mbest(i=[fo, fu],
                         k="node1,node2",
                         s="dir%r,simPriority%n",
                         o=self.oeFile)

            f.run()

        else:
            nm.mcat(i=xxfriends + "/e_*", o=self.oeFile).run()

        nm.mcat(i=xxfriends + "/n_0", o=self.onFile).run()
Ejemplo n.º 9
0
def __mkTree(iFile, oFile):

    temp = mtemp.Mtemp()
    xxbase0 = temp.file()
    xxbase1 = temp.file()
    xxiFile2 = temp.file()
    xxcheck = temp.file()
    """
	# #{iFile}
	# key,nam%0,keyNum,num,nv,nc
	# #2_1,#1_1,4,1,6,1
	# #2_1,#1_2,4,2,0.9999999996,1
	"""

    # keyNumとnum項目のuniqリストを作り、お互いの包含関係でrootノードとleafノードを識別する。
    f0 = nm.mcut(f="keyNum,num", i=iFile)  #{xxiFile1}
    fk = f0.mcut(f="keyNum").muniq(k="keyNum")  #{xxkey}
    fn = f0.mcut(f="num").muniq(k="num")  #{xxnum}

    # root nodesの選択
    fr = nm.mcommon(k="keyNum", K="num", m=fn, i=fk,
                    r=True).mcut(f="keyNum:node0", o=xxbase0)  #{xxbase[0]}

    # leaf nodesの選択
    fl = nm.mcommon(k="num", K="keyNum", m=fk, i=fn,
                    r=True).mcut(f="num")  #{xxleaf}

    # leaf nodeの構造を知る必要はないので入力ファイルのnodeからleafを除外
    f = nm.mcommon(k="num", m=fl, r=True, i=f0, o=xxiFile2)

    nm.runs([f, fr])

    def _xnjoin(inf, outf, mfile, check, no):
        f = nm.mnjoin(k="node%d" % (no),
                      K="keyNum",
                      m=mfile,
                      n=True,
                      f="num:node%d" % (no + 1),
                      i=inf,
                      o=outf)
        fc = nm.mdelnull(i=f, f="node%d" % (no + 1), o=check)
        return fc

    i = 0
    depth = None
    inf = xxbase0
    outf = xxbase1
    '''
	# root nodesファイルから親子関係noodeを次々にjoinしていく
	# xxbase0 : root nodes
	# node0%0
	# 3
	# 4
	# xxbase1
	# node0%0,node1
	# 3,
	# 4,1
	# 4,2
	# xxbase2
	# node0,node1%0,node2
	# 3,,
	# 4,1,
	# 4,2,
	# join項目(node2)の非null項目が0件で終了
	'''

    while True:

        _xnjoin(inf, outf, xxiFile2, xxcheck, i).run()
        size = mrecount(i=xxcheck)

        if size == 0:
            nm.msortf(f="*", i=outf, o=oFile).run()
            depth = i + 1
            break

        # swap f_name
        xxtmp = outf
        outf = inf
        inf = xxtmp
        i += 1

    return depth
Ejemplo n.º 10
0
def __mkNode(key, nf, nl, nv, nc, ni, ef1, ef2, ei, noiso, norm, mapFile,
             oFile):
    xbyE = None
    xbyN = None
    # edgeファイルからnode情報を生成
    # noiso(孤立node排除)の場合は、edgeにあってnodeにないidを省く必要があるので計算する。
    if ni == None or (ni != None and noiso):
        inp = [
            nm.mcut(f="%s:key,%s:nam,%s:nl" % (key, ef1, ef1), i=ei),
            nm.mcut(f="%s:key,%s:nam,%s:nl" % (key, ef2, ef2), i=ei)
        ]
        xbyE <<= nm.mnullto(i=inp, f="key", v="##NULL##")
        xbyE <<= nm.muniq(k="key,nam")
        xbyE <<= nm.mjoin(k="key", K="nam", m=mapFile, f="num:keyNum")
        xbyE <<= nm.mjoin(k="nam", K="nam", m=mapFile, f="num,leaf")
        xbyE <<= nm.msetstr(v=",,,,", a="nv,nc,nlKey,nvKey,ncKey")
        xbyE <<= nm.mcut(
            f="key,nam,keyNum,num,nl,nv,nc,leaf,nvKey,ncKey,nlKey")

    # nodeファイルから作成
    if ni:
        # mcal cat用のlabel項目の作成
        label = []
        #label項目
        if nl:
            for nml in nl:
                label.append(nml)
        else:
            label.append("$s{%s}" % (nf))

        nvcStr = []
        if nv:
            nvcStr.append('%s:nv' % (nv))
        if nc:
            nvcStr.append('%s:nc' % (nc))
        """
		# map
		# nam,leaf,num
		# ##NULL##,,0
		# #1_1,,2
		# #1_2,,3
		# #1_3,,4
		# #2_1,,5
		# a,1,6
		# b,1,7
		# c,1,8
		"""

        f = None
        f <<= nm.mcal(c='cat("_",%s)' % (','.join(label)), a="##label", i=ni)
        if len(nvcStr) == 0:
            f <<= nm.mcut(f='%s:key,%s:nam,##label:nl' % (key, nf))
        else:
            f <<= nm.mcut(f='%s:key,%s:nam,##label:nl,%s' %
                          (key, nf, ','.join(nvcStr)))
        f <<= nm.mnullto(f="key", v="##NULL##")
        if not nv:
            f <<= nm.msetstr(v="", a="nv")
        if not nc:
            f <<= nm.msetstr(v="", a="nc")

        f <<= nm.mjoin(k="key", K="nam", m=mapFile, f="num:keyNum")
        f <<= nm.mjoin(k="nam", K="nam", m=mapFile, f="num,leaf")

        if norm:
            #ノードの拡大率は固定
            nr = 3
            f <<= nm.mnormalize(f="nv:nv2", c="range")
            f <<= nm.mcal(c='${nv2}*(%s-1)+1' % (nr), a="nvv")
            f <<= nm.mcut(f="key,nam,keyNum,num,nl,nvv:nv,nc,leaf")  #o=#{xxa}"
        else:
            f <<= nm.mcut(f="key,nam,keyNum,num,nl,nv,nc,leaf")  #o=#{xxa}"

        xbyN <<= nm.mjoin(k="keyNum",
                          K="num",
                          m=f,
                          f="nl:nlk,nv:nvKey,nc:ncKey",
                          n=True,
                          i=f)
        xbyN <<= nm.mcal(c='if(isnull($s{nlk}),$s{key},$s{nlk})', a='nlKey')
        xbyN <<= nm.mcut(f="nlk", r=True)

    if ni != None and noiso:
        nm.mcommon(k="key,nam", m=xbyE, i=xbyN, o=oFile).run()
    elif ni != None:
        xbyN.writecsv(o=oFile).run()
    else:
        xbyE.writecsv(o=oFile).run()
Ejemplo n.º 11
0
	def enumerate(self,eArgs):
		"""
		eArgsで与えられた条件で、頻出アイテム集合の列挙を実行する。

		:type eArgs: dict
		:type eArgs['type']: str
		:type eArgs['minCnt']: int
		:type eArgs['minSup']: float
		:type eArgs['maxCnt']: int
		:type eArgs['maxSup']: float
		:type eArgs['minLen']: int
		:type eArgs['maxLen']: int
		:type eArgs['top']: int
		:type eArgs['skipTP']: bool【default:False】
		:param eArgs: 各種列挙パラメータ
		:param eArgs['type']: 抽出するアイテム集合の型【'F':頻出集合, 'C':飽和集合, 'M':極大集合】
		:param eArgs['minCnt']: 最小サポート(件数)
		:param eArgs['minSup']: 最小サポート(確率)
		:param eArgs['maxCnt']: 最大サポート(件数)
		:param eArgs['maxSup']: 最大サポート(確率)
		:param eArgs['minLen']: アイテム集合の最小アイテム数(件数)
		:param eArgs['maxLen']: アイテム集合の最大アイテム数(件数)
		:param eArgs['top']: 列挙するサポート上位件数(件数)
		:param eArgs['skipTP']: トランザクションにマッチするパターン(アイテム集合)の出力を行わない。
		"""

		tf=mtemp.Mtemp()
		self.eArgs=eArgs
		self.type = eArgs["type"]

		if "minCnt" in eArgs and eArgs["minCnt"] != None:
			self.minCnt = int(eArgs["minCnt"])
			self.minSup = float(self.minCnt) / float(self.db.traSize)
		else:
			self.minSup = float(eArgs["minSup"])
			self.minCnt = int(self.minSup * float(self.db.traSize) + 0.99)

		# 最大サポートと最大サポート件数
		self.maxCnt=None
		if ("maxCnt" in eArgs and  eArgs["maxCnt"]!= None) or ( "maxSup" in eArgs and eArgs["maxSup"]!= None):
			if "maxCnt" in eArgs and eArgs["maxCnt"]!= None:
				self.maxCnt = int(eArgs["maxCnt"])
				self.maxSup = float(self.maxCnt) / float(self.db.traSize)
			else:
				self.maxSup    = float(eArgs["maxSup"])
				self.maxCnt = int(self.maxSup * float(self.db.traSize) + 0.99)


		params = {}
		if self.msgoff:
			params["type"] ="%sIf_"%(self.type)
		else:
			params["type"] ="%sIf"%(self.type)


		if self.maxCnt :
			params["U"] = str(self.maxCnt)

		if "minLen" in eArgs and eArgs["minLen"] != None :
			params["l"] = str(eArgs['minLen'])
		
		if "maxLen" in eArgs and eArgs["maxLen"] != None :
			params["u"] = str(eArgs['maxLen'])

		# 列挙パターン数上限が指定されれば、一度lcmを実行して最小サポートを得る
		if "top" in eArgs and eArgs["top"] != None :
			self.top = eArgs["top"]

		if self.top and self.top>0 :

			xxtop = tf.file()
			import copy
			top_params = copy.deepcopy(params)
			top_params["i"] = self.file
			top_params["sup"] = "1"
			top_params["K"] = str(self.top)
			top_params["so"] = xxtop
			import re
			top_params["type"] = re.sub('_$', '', top_params["type"] )

			extTake.lcm(top_params)

			with open(xxtop, "r") as rfile:
				self.minCnt = int(rfile.read().strip())

			if self.minCnt<0 :
				self.minCnt=1 


		self.skipTP=False
		if "skipTP" in eArgs:
			self.skipTP=eArgs["skipTP"]

		# lcm_seq出力ファイル
		lcmout = tf.file()

		# 頻出パターンがなかった場合、lcm出力ファイルが生成されないので
		# そのときのために空ファイルを生成しておいく。
		with open(lcmout, "w") as efile:
			pass

		# lcm実行
		params["i"] = self.file
		params["sup"] = str(self.minCnt)
		params["o"] = lcmout
		extTake.lcm(params)

		# caliculate one itemset for lift value
		xxone= tf.file()
		tpstr = "FIf_" if self.msgoff else "FIf"

		extTake.lcm(type=tpstr,i=self.file,sup=1,o=xxone,l=1,u=1)


		# パターンのサポートを計算しCSV出力する
		#MCMD::msgLog("output patterns to CSV file ...")

		xxp0 = tf.file()
		self.pFile = self.temp.file()
		items=self.db.items
		trans0 = self.temp.file()

		extTake.lcmtrans(lcmout,"p",trans0)

		f =   nm.mdelnull(i=trans0,f="pattern")
		f <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
		f <<= nm.msetstr(v=self.db.traSize,a="total")
		f <<= nm.mcal(c='${count}/${total}',a="support")
		f <<= nm.mcut(f="pid,pattern,size,count,total,support")
		f <<= nm.mvsort(vf="pattern")
		f <<= nm.msortf(f="pid",o=xxp0)
		f.run()


		# xxp0
		# pid,count,total,support,pattern
		# 0,13,13,1,A
		# 4,6,13,0.4615384615,A B
		xxp1=tf.file()

		# taxonomy指定がない場合(2010/11/20追加)
		if items.taxonomy==None:
			shutil.move(xxp0,xxp1)
		# taxonomy指定がある場合
		else:
			#MCMD::msgLog("reducing redundant rules in terms of taxonomy ...")

			zdd=VSOP.constant(0)
			fobj = nm.mcut(i=xxp0,f='pattern')
			for fldVal in fobj:
				zdd=zdd+VSOP.itemset(fldVal[0])

			
			zdd=self.reduceTaxo(zdd,self.db.items)
			xxz1=tf.file()
			xxz2=tf.file()
			zdd.csvout(xxz1)

			f0=None
			f0 <<= nm.mcut(nfni=True,f="1:pattern",i=xxz1)
			f0 <<= nm.mvsort(vf="pattern")
			f0 <<= nm.msortf(f="pattern")

			f=None
			f <<= nm.msortf(f="pattern",i=xxp0)
			f <<= nm.mcommon(k="pattern",m=f0)
			f <<= nm.msortf(f="pid",o=xxp1)
			f.run()


		# lift値の計算		
		transl = tf.file()
		extTake.lcmtrans(xxone,"p",transl)

		xxp2 =   nm.mdelnull(i=transl,f="pattern")
		xxp2 <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
		xxp2 <<= nm.msortf(f="pattern")

		xxp3 =   nm.mcut(f="pid,pattern",i=xxp1)
		xxp3 <<= nm.mtra(f="pattern",r=True)
		xxp3 <<= nm.mjoin(k="pattern",m=xxp2,f="count:c1")
		xxp3 <<= nm.mcal(c='ln(${c1})',a="c1ln")
		xxp3 <<= nm.msum(k="pid",f="c1ln")

		# p3
		# pid,pattern,c1,c1ln
		# 0,A,13,2.564949357
		# 1,E,7,1.945910149
		
		#おかしくなる?=>OK
		f3 =   nm.mjoin(k="pid",f="c1ln",i=xxp1,m=xxp3)
		f3 <<= nm.mcal(c='round(exp(ln(${count})-${c1ln}+(${size}-1)*ln(${total})),0.0001)',a="lift")
		f3 <<= nm.mcut(f="pid,size,count,total,support,lift,pattern")
		f3 <<= nm.msortf(f="support%nr",o=self.pFile)
		f3.run()

		#self.size = mrecount.mrecount(i=self.file)

		#MCMD::msgLog("the number of patterns enumerated is #{@size}")

		if not self.skipTP:
			# トランザクション毎に出現するシーケンスを書き出す
			#MCMD::msgLog("output tid-patterns ...")

			self.tFile = self.temp.file()
			xxw3i = tf.file()
			extTake.lcmtrans(lcmout,"t",xxw3i)

			xxw1 = nm.mcut(f=self.db.idFN,i=self.db.file).muniq(k=self.db.idFN).mnumber(S=0,a="__tid",q=True).msortf(f="__tid")
			xxw2 = nm.mcut(f="pid",i=self.pFile)
			xxw3 = nm.mcommon(k="pid",i=xxw3i,m=xxw2).mjoin(k="__tid",m=xxw1,f=self.db.idFN).mcut(f=self.db.idFN+",pid",o=self.tFile)
			xxw3.run()
Ejemplo n.º 12
0
    def run(self):
        wf = mtemp.Mtemp()
        xxpal = wf.file()
        xxa = wf.file()
        xxb = wf.file()
        xxc = wf.file()
        xxd = wf.file()
        xxout = wf.file()

        # ============
        # n1,n2,sim
        # a,b,0.40
        # a,c,0.31
        # a,d,0.22
        # b,c,0.20
        # b,d,0.24
        # b,e,0.14
        # c,d,0.30
        # d,e,0.09
        xpal = None
        if self.directed:
            # 任意の枝a->bのaについて上位rankを選択
            xpal <<= nm.mnumber(k=self.ef1,
                                s=self.sim + "%nr",
                                e="skip",
                                S=1,
                                a="##rank",
                                i=self.ei)
            xpal <<= nm.mselnum(f="##rank", c="[," + str(self.rank) + "]")
        else:
            xxa = nm.mfsort(f=[self.ef1, self.ef2], i=self.ei)
            xxb = nm.mfsort(f=[self.ef2, self.ef1], i=self.ei)
            xpal <<= nm.muniq(k=[self.ef1, self.ef2], i=[xxa, xxb])
            xpal <<= nm.mnumber(k=self.ef1,
                                s=self.sim + "%nr",
                                e="skip",
                                S=1,
                                a="##rank")
            xpal <<= nm.mselnum(f="##rank", c="[," + str(self.rank) + "]")

        # 両方向+片方向
        xpal1 = None
        if self.dir == "x":
            xpal1 <<= nm.mcut(f=[self.ef1, self.ef2, self.sim], i=xpal)
        # 両方向
        elif self.dir == "b":
            selpara = "$s{%s}==$s{##ef2}" % (self.ef1)
            # 得られた上位rankグラフからa->b->cを作成し、a==cであれば相思相愛ということ
            xpal1 <<= nm.mnjoin(k=self.ef2,
                                K=self.ef1,
                                m=xpal,
                                f=self.ef2 + ":##ef2," + self.sim + ":sim2",
                                i=xpal)
            xpal1 <<= nm.msel(c=selpara)
            xpal1 <<= nm.mcut(f=[self.ef1, self.ef2, self.sim])
        else:
            selpara = "$s{%s}==$s{##ef2}" % (self.ef1)
            xxc = None
            xxc <<= nm.mnjoin(k=self.ef2,
                              K=self.ef1,
                              m=xpal,
                              f=self.ef2 + ":##ef2," + self.sim + ":sim2",
                              i=xpal)
            xxc <<= nm.msel(c=selpara)
            xxc <<= nm.mcut(f=[self.ef1, self.ef2])
            xpal1 <<= nm.mcut(f=[self.ef1, self.ef2, self.sim], i=xpal)
            xpal1 <<= nm.mcommon(k=self.ef1 + "," + self.ef2, m=xxc, r=True)

        runpal = None
        kpara = "%s,%s" % (self.ef1, self.ef2)
        if self.udout:
            runpal <<= nm.mfsort(f=kpara, i=xpal1)
            runpal <<= nm.mavg(k=kpara, f=self.sim)
            runpal <<= nm.msortf(f=kpara, o=self.eo)
        else:
            runpal <<= nm.msortf(f=kpara, i=xpal1, o=self.eo)

        runpal.run()

        if self.ni and self.no:
            shutil.copyfile(self.ni, self.no)