コード例 #1
0
    def edge2mtx(self, ei, itra, map1, map2):
        """Convert the edge file into a numeric transaction file.

        ei   : input edge file
        itra : output transaction file (space-separated num2 values per num1)
        map1 : output numbering map for the first node field
        map2 : output numbering map for the second node field
        """
        # numbering map for the first node field -> map1
        nodes1 = nm.mcut(f=self.ef1, i=ei)
        nodes1 <<= nm.muniq(k=self.ef1)
        nodes1 <<= nm.mdelnull(f=self.ef1)
        nodes1 <<= nm.mnumber(q=True, a="num1", S=1, o=map1)

        # numbering map for the second node field -> map2
        nodes2 = nm.mcut(f=self.ef2, i=ei)
        nodes2 <<= nm.muniq(k=self.ef2)
        nodes2 <<= nm.mdelnull(f=self.ef2)
        nodes2 <<= nm.mnumber(q=True, a="num2", S=1, o=map2)

        # replace node names by their numbers, group num2 values by num1,
        # then write them space-separated (tr swaps the CSV commas)
        pipe = nm.mcut(f=[self.ef1, self.ef2], i=ei)
        pipe <<= nm.mjoin(k=self.ef1, m=nodes1, f="num1")
        pipe <<= nm.mjoin(k=self.ef2, m=nodes2, f="num2")
        pipe <<= nm.mcut(f="num1,num2")
        pipe <<= nm.mtra(k="num1", f="num2")
        pipe <<= nm.msortf(f="num1%n")
        pipe <<= nm.mcut(f="num2", nfno=True)
        pipe <<= nm.cmd("tr ',' ' '")
        pipe <<= nm.mwrite(o=itra)
        pipe.run()
コード例 #2
0
    def pair2tra(self, ei, ef1, ef2, traFile, mapFile1, mapFile2):
        """Convert an edge-pair file into a transaction file.

        ei       : input edge file
        ef1, ef2 : names of the two node fields in ei
        traFile  : output transaction file (node2 numbers grouped per node1)
        mapFile1 : output mapping node1 -> num1
        mapFile2 : output mapping node2 -> num2

        FIX: the original body read self.ei / self.ef1 / self.ef2 and
        silently ignored the ei/ef1/ef2 parameters; it now uses the
        parameters (callers pass exactly those attributes, so behavior
        for existing call sites is unchanged).
        """
        # numbering map for the first node field -> mapFile1
        f1 = nm.mcut(f="{}:node1".format(ef1), i=ei)
        f1 <<= nm.mdelnull(f="node1")
        f1 <<= nm.muniq(k="node1")
        f1 <<= nm.mnumber(s="node1", a="num1", o=mapFile1)

        # numbering map for the second node field -> mapFile2
        f2 = nm.mcut(f="{}:node2".format(ef2), i=ei)
        f2 <<= nm.mdelnull(f="node2")
        f2 <<= nm.muniq(k="node2")
        f2 <<= nm.mnumber(s="node2", a="num2", o=mapFile2)

        # rewrite edges as number pairs and group num2 by num1
        f3 = nm.mcut(f="{}:node1,{}:node2".format(ef1, ef2), i=ei)
        f3 <<= nm.mjoin(k="node1", m=f1, f="num1")
        f3 <<= nm.mjoin(k="node2", m=f2, f="num2")
        f3 <<= nm.mcut(f="num1,num2")
        f3 <<= nm.msortf(f="num1,num2%n")
        f3 <<= nm.mtra(k="num1", s="num2%n", f="num2")
        f3 <<= nm.msortf(f="num1%n")
        f3 <<= nm.mcut(f="num2", nfno=True, o=traFile)
        f3.run()
        # NOTE(review): debug dump of the generated transaction file;
        # consider removing or guarding behind a verbosity flag.
        os.system("cat " + traFile)
コード例 #3
0
def readCSV(iParams):
    """Split the input CSV by class and load each part via _readCSV_sub.

    iParams maps option names to values: iFile (input path), sid/time/item
    (field names), and optionally cFile/csid/cNames for a class master.
    Returns a dict mapping class name -> result of _readCSV_sub.
    """
    iFile = iParams["iFile"]
    sidF = iParams["sid"]
    eidF = iParams["time"]
    itemF = iParams["item"]

    temp = Mtemp()
    xxdatPath = temp.file()
    mkDir(xxdatPath)

    # common prefix: normalize field names, drop null and duplicate rows
    pipe = nm.mcut(f="%s:sid,%s:eid,%s:item" % (sidF, eidF, itemF), i=iFile)
    pipe <<= nm.mdelnull(f="sid,eid,item")
    pipe <<= nm.muniq(k="sid,eid,item")

    if "cFile" in iParams:
        # a class master is given: join the class name onto each sid and
        # split the data into one file per class under xxdatPath
        cFile = iParams["cFile"]
        csidF = iParams["csid"]
        classF = iParams["cNames"]

        pipe <<= nm.mjoin(k="sid", K=csidF, m=cFile, f="%s:class" % (classF))
        pipe <<= nm.msep(s="sid,eid%n,item",
                         d="%s/${class}" % (xxdatPath),
                         p=True)
        pipe.run()

        classNames = [os.path.basename(path)
                      for path in glob.glob("%s/*" % (xxdatPath))]
    else:
        # no class master: everything goes into one pseudo class "single"
        pipe <<= nm.msortf(f="sid,eid%n,item", o="%s/single" % (xxdatPath))
        pipe.run()
        classNames = ["single"]

    return {name: _readCSV_sub("%s/%s" % (xxdatPath, name))
            for name in classNames}
コード例 #4
0
 def _xnjoin(inf, outf, mfile, check, no):
     """Natural-join the next node level onto level `no`.

     Writes the joined stream to outf and returns a pipeline that sends
     rows whose new node field is null (presumably the unmatched ones --
     confirm against mdelnull semantics) to the check file.
     """
     key = "node%d" % (no)
     nxt = "node%d" % (no + 1)
     joined = nm.mnjoin(k=key,
                        K="keyNum",
                        m=mfile,
                        n=True,
                        f="num:" + nxt,
                        i=inf,
                        o=outf)
     return nm.mdelnull(i=joined, f=nxt, o=check)
コード例 #5
0
    def run(self):
        """Enumerate frequent node-set pairs from the edge file.

        Converts the edge pairs into a transaction file, runs lcm on it,
        and joins the lcm output back to the original node names, writing
        the final table to self.oFile.  When self.byedge is set the
        node1 side is expanded per edge; otherwise node1 values are
        re-grouped per pattern.

        FIX: removed the unused temp file `xxp0` (allocated, never used).
        """
        tempW = mtemp.Mtemp()

        xxtra = tempW.file()
        xxmap1 = tempW.file()
        xxmap2 = tempW.file()
        lcmout = tempW.file()

        xxt0 = tempW.file()
        xx3t = tempW.file()
        xx4t = tempW.file()

        self.pair2tra(self.ei, self.ef1, self.ef2, xxtra, xxmap1, xxmap2)

        # lcm parameters: closed frequent itemsets, minimum support 1
        runPara = {}
        runPara["type"] = "CIf"
        runPara["sup"] = 1
        runPara["o"] = lcmout
        runPara["i"] = xxtra

        # itemset-size bounds on the node2 side
        if self.minSize2:
            runPara["l"] = self.minSize2
        if self.maxSize2:
            runPara["u"] = self.maxSize2

        extTake.lcm(runPara)
        extTake.lcmtrans(lcmout, "p", xxt0)

        # patterns with node2 numbers replaced by the original node names
        f = None
        f <<= nm.mdelnull(f="pattern", i=xxt0)
        f <<= nm.mvreplace(vf="pattern", m=xxmap2, K="num2", f="node2")
        f <<= nm.mcut(f="pid,pattern,size:size2")
        f <<= nm.mvsort(vf="pattern")
        f <<= nm.msortf(f="pid")

        if self.byedge:
            f_e0 = nm.mtra(f="pattern", i=f, r=True)
            extTake.lcmtrans(lcmout, "t", xx3t)

            # occurrences joined back to node1 names
            f_e1 = None
            f_e1 <<= nm.mjoin(k="__tid", m=xxmap1, f="node1", K="num1", i=xx3t)
            f_e1 <<= nm.msortf(f="pid")

            # per-pattern node1-side size, filtered by the size range
            f_e2 = None
            f_e2 <<= nm.mcount(k="pid", a="size1", i=f_e1)
            f_e2 <<= nm.mselnum(f="size1",
                                c="[{},{}]".format(self.minSize1,
                                                   self.maxSize1))

            f_e3 = None
            f_e3 <<= nm.mjoin(k="pid", m=f_e2, f="size1", i=f_e1)
            f_e3 <<= nm.mnjoin(k="pid", m=f_e0, f="pattern,size2")
            f_e3 <<= nm.mcut(f="pid:id,node1:{},pattern:{},size1,size2".format(
                self.ef1, self.ef2),
                             o=self.oFile)
            f_e3.run()

        else:

            extTake.lcmtrans(lcmout, "t", xx4t)
            f_e4 = None
            f_e4 <<= nm.mjoin(k="__tid", m=xxmap1, i=xx4t, f="node1", K="num1")
            f_e4 <<= nm.mtra(k="pid", f="node1")
            f_e4 <<= nm.mvcount(vf="node1:size1")
            f_e4 <<= nm.mjoin(k="pid", m=f, f="pattern,size2")
            f_e4 <<= nm.mselnum(f="size1",
                                c="[{},{}]".format(self.minSize1,
                                                   self.maxSize1))
            f_e4 <<= nm.mvsort(vf="node1,pattern")
            f_e4 <<= nm.msortf(f="node1,pattern")
            f_e4 <<= nm.mcut(f="node1:{},pattern:{},size1,size2".format(
                self.ef1, self.ef2),
                             o=self.oFile)
            f_e4.run()
コード例 #6
0
	def enumerate(self,eArgs):
		"""Enumerate contrast (emerging) sequence patterns per class.

		eArgs carries the enumeration parameters: minCnt/minSup,
		maxCnt/maxSup, minGR/minProb/uniform, minLen/maxLen, gap, win,
		padding.  Each class is taken in turn as the positive set (all
		others negative), lcm_seq is run with a per-class threshold
		(calSigma), and the results are merged into self.pFile and,
		when self.outtf is set, self.tFile.

		FIX: removed a leftover duplicate pipeline that rebuilt the
		__tid master file (xxw) without the final sort, clobbering the
		sorted version written just above it; the refactored parallel
		implementation keeps only the sorted pipeline.
		"""
		tf=nu.Mtemp()

		# minimum support: given either as a count (minCnt) or a ratio (minSup)
		if "minCnt" in eArgs :
			self.minCnt = int(eArgs["minCnt"])
			self.minSup = float(self.minCnt)/ float(self.db.size)
		else:
			self.minSup = float(eArgs["minSup"])
			self.minCnt = int(self.minSup * float(self.db.size) + 0.99)


		# maximum support: given either as a count (maxCnt) or a ratio (maxSup)
		self.maxCnt=None
		if "maxCnt" in eArgs or "maxSup" in eArgs:
			if "maxCnt" in eArgs:
				self.maxCnt = int(eArgs["maxCnt"])
				self.maxSup = float(self.maxCnt)/float(self.db.size)
			else:
				self.maxSup = float(eArgs["maxSup"])
				self.maxCnt = int(self.maxSup * float(self.db.size) + 0.99)

		# one class at a time is treated as pos and the rest as neg;
		# per-class enumeration result files are collected here
		pFiles=[]
		tFiles=[]
		for cName,posSize in self.db.clsNameRecSize.items():
			negSize=self.db.size-posSize

			# minimum growth rate (manual, eq.(4))
			if "minGR" in eArgs:
				self.minGR=eArgs["minGR"]
			else:
				minProb = eArgs["minProb"] if ( "minProb" in eArgs ) else 0.5
				if "uniform" in eArgs and eArgs["uniform"]:
					self.minGR = (minProb/(1-minProb)) * (self.db.clsSize-1)
				else:
					self.minGR = (minProb/(1-minProb)) * (float(negSize)/float(posSize))


			# minimum support count on the pos class
			if "minCnt" in eArgs:
				self.minPos = eArgs["minCnt"]
			else:
				self.minPos = int(eArgs["minSup"] * float(posSize) + 0.99)

			# maximum support count on the pos class
			if "maxCnt" in eArgs or "maxSup" in eArgs:
				if "maxCnt" in eArgs:
					self.maxCnt = int(eArgs["maxCnt"])
				else:
					self.maxCnt = int(eArgs["maxSup"] * float(posSize) + 0.99)


			self.sigma[cName] = self.calSigma(self.minPos,self.minGR,posSize,negSize)

			# lcm_seq output file; lcm_seq does not create it when no
			# frequent pattern is found, so create an empty one beforehand
			lcmout = tf.file()
			with open(lcmout, "w") as efile:
				pass

			params = {}
			if self.msgoff:
				params["type"] ="CIA_"
			else:
				params["type"] ="CIA"

			if self.maxCnt: # upper bound of the window size
				params["U"] = str(self.maxCnt)
			if "minLen" in eArgs:
				params["l"] = str(eArgs["minLen"])
			if 'maxLen' in eArgs:
				params["u"] = str(eArgs["maxLen"])
			if 'gap' in eArgs:
				params["g"] = str(eArgs["gap"])
			if 'win' in eArgs:
				params["G"] = str(eArgs["win"])

			params["w"] = self.weightFile[cName]
			params["i"] = self.file
			params["sup"] = str(self.sigma[cName])
			params["o"] = lcmout

			# run lcm_seq; with padding, use the variant that does not
			# output the 0 item
			if 'padding' in eArgs and eArgs["padding"]:
				extTake.lcmseq_zero(params)
			else:
				extTake.lcmseq(params)

			# compute the support of each pattern and write it as CSV
			pFiles.append(self.temp.file())
			transle = self.temp.file()

			extTake.lcmtrans(lcmout,"e",transle) # pattern,countP,countN,size,pid

			f=None
			f <<= nm.mdelnull(f="pattern",i=transle)
			f <<= nm.mcal(c='round(${countN},1)',a="neg")
			f <<= nm.mcal(c='round(${countP}/%s,1)'%(self.posWeight[cName]),a="pos")
			f <<= nm.mdelnull(f="pattern")
			f <<= nm.msetstr(v=cName,a="class")
			f <<= nm.msetstr(v=posSize,a="posTotal")
			f <<= nm.msetstr(v=self.minGR,a="minGR")
			f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,minGR",o=pFiles[-1])
			f.run()

			if self.outtf :
				# write out, per transaction, the patterns occurring in it
				tFiles.append(self.temp.file())

				# mapping from sequential __tid to the original transaction id
				xxw= tf.file()
				f=None
				f <<= nm.mcut(f=self.db.idFN,i=self.db.file)
				f <<= nm.muniq(k=self.db.idFN)
				f <<= nm.mnumber(S=0,a="__tid",q=True)
				f <<= nm.msortf(f="__tid",o=xxw)
				f.run()

				translt = self.temp.file()
				extTake.lcmtrans(lcmout,"t",translt)
				nm.mjoin(k="__tid",m=xxw,f=self.db.idFN,i=translt).msetstr(v=cName,a="class").mcut(f=self.db.idFN+",class,pid",o=tFiles[-1]).run()


		# merge the per-class pattern and tid-pid files into the final output
		self.pFile = self.temp.file()
		self.tFile = self.temp.file()

		# concatenate the pattern files and assign a unified pattern id
		xxpCat = tf.file()
		f =   nm.mcat(i=",".join(pFiles))
		f <<= nm.msortf(f="class,pid")
		f <<= nm.mnumber(s="class,pid",S=0,a="ppid",o=xxpCat)
		f.run()

		# per-pattern statistics
		items=self.db.items
		f =   nm.mcut(f="class,ppid:pid,pattern,size,pos,neg,posTotal,minGR",i=xxpCat)
		f <<= nm.msetstr(v=self.db.size,a="total")
		f <<= nm.mcal(c='${total}-${posTotal}',a="negTotal") # total count of neg
		f <<= nm.mcal(c='${pos}/${posTotal}',a="support") # support ratio
		f <<= nm.mcal(c='if(${neg}==0,1.797693135e+308,(${pos}/${posTotal})/(${neg}/${negTotal}))',a="growthRate")
		if "uniform" in eArgs and eArgs["uniform"] == True:
			f <<= nm.mcal(c='(${pos}/${posTotal})/(${pos}/${posTotal}+(%s-1)*${neg}/${negTotal})'%(self.db.clsSize),a="postProb")
		else:
			f <<= nm.mcal(c='${pos}/(${pos}+${neg})',a="postProb")

		f <<= nm.msel(c='${pos}>=%s&&${growthRate}>=${minGR}'%(self.minPos)) # filter by min support and minGR
		f <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
		f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,negTotal,total,support,growthRate,postProb")
		f <<= nm.mvsort(vf="pattern")
		f <<= nm.msortf(f="class%nr,postProb%nr,pos%nr",o=self.pFile)
		f.run()

		if self.outtf :
			# master used to select only transactions that contain an
			# enumerated pattern
			xxp4=nm.mcut(f="class,pid",i=self.pFile)

			f =   nm.mcat(i=",".join(tFiles))
			f <<= nm.mjoin(k="class,pid",m=xxpCat,f="ppid") # attach the unified pattern id
			f <<= nm.mcommon(k="class,ppid",K="class,pid",m=xxp4) # keep enumerated patterns only
			f <<= nm.mcut(f=self.db.idFN+",class,ppid:pid")
			f <<= nm.msortf(f=self.db.idFN+",class,pid",o=self.tFile)
			f.run()


		self.size = nu.mrecount(i=self.pFile)
コード例 #7
0
    def enumerate(self, eArgs):
        """Enumerate frequent sequence patterns with lcm_seq.

        Reads the enumeration parameters from eArgs (minCnt/minSup,
        maxCnt/maxSup, top, minLen/maxLen, gap, win, padding), runs
        lcm_seq on self.file, and writes the pattern table to
        self.pFile; when self.outtf is true it also writes
        transaction-id / pattern-id pairs to self.tFile.
        """
        tf = mtemp.Mtemp()

        # minimum support: given either as a count (minCnt) or a ratio (minSup)
        if "minCnt" in eArgs and eArgs["minCnt"] != None:
            self.minCnt = int(eArgs["minCnt"])
            self.minSup = float(self.minCnt) / float(self.db.size)
        else:
            self.minSup = float(eArgs["minSup"])
            self.minCnt = int(self.minSup * float(self.db.size) + 0.99)

        # maximum support: given either as a count (maxCnt) or a ratio (maxSup)
        self.maxCnt = None
        if ("maxCnt" in eArgs
                and eArgs["maxCnt"] != None) or ("maxSup" in eArgs
                                                 and eArgs["maxSup"] != None):
            if "maxCnt" in eArgs and eArgs["maxCnt"] != None:
                self.maxCnt = int(eArgs["maxCnt"])
                self.maxSup = float(self.maxCnt) / float(self.db.size)
            else:
                self.maxSup = float(eArgs["maxSup"])
                self.maxCnt = int(self.maxSup * float(self.db.size) + 0.99)

        # upper bound on the number of patterns to enumerate
        if "top" in eArgs and eArgs["top"] != None:
            self.top = eArgs["top"]

        # when top is given, run lcm_seq once to derive the minimum support
        # count that yields at most `top` patterns
        if self.top and self.top > 0:

            xxtop = tf.file()

            extTake.lcmseq(type="Cf",
                           K=str(self.top),
                           i=self.file,
                           sup="1",
                           so=xxtop)

            with open(xxtop, "r") as rfile:
                self.minCnt = int(rfile.read().strip())

        # lcm_seq output file
        lcmout = tf.file()
        # lcm_seq does not create its output file when no frequent pattern
        # is found, so create an empty file beforehand just in case
        with open(lcmout, "w") as efile:
            pass

        # set the lcm_seq parameters and run it
        params = {}
        if self.msgoff:
            params["type"] = "CIf_"
        else:
            params["type"] = "CIf"

        if self.maxCnt:
            params["U"] = str(self.maxCnt)
        if "minLen" in eArgs:
            params["l"] = str(eArgs["minLen"])
        if 'maxLen' in eArgs:
            params["u"] = str(eArgs["maxLen"])
        if 'gap' in eArgs:
            params["g"] = str(eArgs["gap"])
        if 'win' in eArgs:
            params["G"] = str(eArgs["win"])

        params["i"] = self.file
        params["sup"] = str(self.minCnt)
        params["o"] = lcmout

        # run lcm_seq; with padding, use the variant that does not output
        # the 0 item
        if 'padding' in eArgs and eArgs[
                "padding"]:
            extTake.lcmseq_zero(params)
        else:
            extTake.lcmseq(params)

        # compute the support of each pattern and write it as CSV
        self.pFile = self.temp.file()
        items = self.db.items

        transl = self.temp.file()
        extTake.lcmtrans(lcmout, "p", transl)

        f = nm.mdelnull(f="pattern", i=transl)
        f <<= nm.mvreplace(vf="pattern",
                           m=items.file,
                           K=items.idFN,
                           f=items.itemFN)
        f <<= nm.msetstr(v=self.db.size, a="total")
        f <<= nm.mcal(c='${count}/${total}', a="support")  # support ratio
        f <<= nm.mcut(f="pid,pattern,size,count,total,support")
        f <<= nm.msortf(f="support%nr", o=self.pFile)
        f.run()

        if self.outtf:
            # write out, per transaction, the patterns occurring in it
            self.tFile = self.temp.file()

            # mapping from sequential __tid to the original transaction id
            xxw = tf.file()
            f = None
            f <<= nm.mcut(f=self.db.idFN, i=self.db.file)
            f <<= nm.muniq(k=self.db.idFN)
            f <<= nm.mnumber(S=0, a="__tid", q=True)
            f <<= nm.msortf(f="__tid", o=xxw)
            f.run()

            translt = self.temp.file()
            extTake.lcmtrans(lcmout, "t", translt)

            f = None
            f <<= nm.msortf(f="__tid", i=translt)
            f <<= nm.mjoin(k="__tid", m=xxw, f=self.db.idFN)
            f <<= nm.mcut(f=self.db.idFN + ",pid")
            f <<= nm.msortf(f=self.db.idFN + ",pid", o=self.tFile)
            f.run()
コード例 #8
0
    def enumerate(self, eArgs):
        """Enumerate contrast (emerging) patterns per class with lcm.

        Each class is taken in turn as the positive set and all other
        classes as the negative set; lcm is run with a per-class support
        threshold (calSigma) and the per-class results are merged into
        self.pFile (patterns) and, when self.outtf is set, self.tFile
        (transaction-id x pattern-id pairs).

        FIX: `flost(posSize)` was a typo for `float(posSize)` and raised
        NameError whenever minSup was given as a plain (non-dict) value.
        Also removed an unused `xxw = tf.file()` whose value was
        immediately overwritten.
        """
        pFiles = []
        tFiles = []
        tf = mtemp.Mtemp()
        for cName, posSize in self.db.clsNameRecSize.items():
            negSize = self.db.traSize - posSize

            # minimum growth rate (manual, eq.(4))
            if "minGR" in eArgs:
                self.minGR = eArgs["minGR"]
            else:
                minProb = eArgs["minProb"] if ("minProb" in eArgs) else 0.5
                if "uniform" in eArgs and eArgs["uniform"] == True:
                    self.minGR = (minProb / (1 - minProb)) * (
                        self.db.clsSize - 1)
                else:
                    self.minGR = (minProb / (1 - minProb)) * (
                        float(negSize) / float(posSize))

            # minimum support; either form accepts a single value or a
            # per-class dict, e.g.
            #   s=0.05        s=c1:0.05,c2:0.06
            #   S=10          S=c1:10,c2:15
            if "minCnt" in eArgs:
                if isinstance(eArgs["minCnt"], dict):
                    self.minPos = eArgs["minCnt"][cName]
                else:
                    self.minPos = eArgs["minCnt"]
            else:
                if isinstance(eArgs["minSup"], dict):
                    self.minPos = int(eArgs["minSup"][cName] * float(posSize) +
                                      0.99)
                else:
                    # FIX: was `flost(posSize)` (NameError)
                    self.minPos = int(eArgs["minSup"] * float(posSize) + 0.99)

            # maximum support (count or ratio, scalar or per-class dict)
            if "maxCnt" in eArgs:
                if isinstance(eArgs["maxCnt"], dict):
                    self.maxPos = eArgs["maxCnt"][cName]
                else:
                    self.maxPos = eArgs["maxCnt"]

            elif "maxSup" in eArgs:
                if isinstance(eArgs["maxSup"], dict):
                    self.maxPos = int(eArgs["maxSup"][cName] * float(posSize) +
                                      0.99)
                else:
                    self.maxPos = int(eArgs["maxSup"] * float(posSize) + 0.99)
            else:
                self.maxPos = None

            self.sigma[cName] = self.calSigma(self.minPos, self.minGR, posSize,
                                              negSize)

            # lcm output file; lcm does not create it when no frequent
            # pattern is found, so create an empty file beforehand
            lcmout = tf.file()
            with open(lcmout, "w") as efile:
                pass

            runPara = {}

            if self.msgoff:
                runPara["type"] = eArgs["type"] + "IA_"
            else:
                runPara["type"] = eArgs["type"] + "IA"

            # NOTE: the original Ruby version tested @maxCnt here (never
            # set), so the upper bound was effectively dead code there
            if self.maxPos:
                runPara["U"] = self.maxPos

            if "minLen" in eArgs:
                runPara["l"] = str(eArgs["minLen"])

            if "maxLen" in eArgs:
                runPara["u"] = str(eArgs["maxLen"])

            runPara["w"] = self.weightFile[cName]

            runPara["i"] = self.file

            runPara["sup"] = str(self.sigma[cName])

            runPara["o"] = lcmout

            # run lcm
            extTake.lcm(runPara)

            pFiles.append(self.temp.file())

            transle = tf.file()
            extTake.lcmtrans(lcmout, "e", transle)

            # compute per-class pos/neg counts and write the patterns as CSV
            f = nm.mdelnull(f="pattern", i=transle)
            f <<= nm.mcal(c='round(${countN},1)', a="neg")
            f <<= nm.mcal(c='round(${countP}/%s,1)' % (self.posWeight[cName]),
                          a="pos")
            f <<= nm.mdelnull(f="pattern")  # likely redundant (done above)
            f <<= nm.msetstr(v=cName, a="class")
            f <<= nm.msetstr(v=posSize, a="posTotal")
            f <<= nm.msetstr(v=self.minGR, a="minGR")
            f <<= nm.mcut(f="class,pid,pattern,size,pos,neg,posTotal,minGR",
                          o=pFiles[-1])
            f.run()

            if self.outtf:
                # write out, per transaction, the patterns occurring in it
                tFiles.append(self.temp.file())

                # mapping from sequential __tid to the original transaction id
                xxw = nm.mcut(f=self.db.idFN, i=self.db.file)
                xxw <<= nm.muniq(k=self.db.idFN)
                xxw <<= nm.mnumber(S=0, a="__tid", q=True)

                translt = self.temp.file()
                extTake.lcmtrans(lcmout, "t", translt)

                f = nm.mjoin(k="__tid", m=xxw, f=self.db.idFN, i=translt)
                f <<= nm.msetstr(v=cName, a="class")
                f <<= nm.mcut(f=self.db.idFN + ",class,pid", o=tFiles[-1])
                f.run()

        # merge the per-class pattern and tid-pid files into the final output
        self.pFile = self.temp.file()
        self.tFile = self.temp.file()

        # concatenate the pattern files and assign a unified pattern id
        xxpCat = tf.file()
        f = nm.mcat(i=",".join(pFiles))
        f <<= nm.msortf(f="class,pid")
        f <<= nm.mnumber(s="class,pid", S=0, a="ppid", o=xxpCat)
        f.run()

        # per-pattern statistics
        items = self.db.items
        f = nm.mcut(f="class,ppid:pid,pattern,size,pos,neg,posTotal,minGR",
                    i=xxpCat)
        f <<= nm.msetstr(v=self.db.traSize, a="total")
        f <<= nm.mcal(c='${total}-${posTotal}', a="negTotal")  # total of neg
        f <<= nm.mcal(c='${pos}/${posTotal}', a="support")  # support ratio
        f <<= nm.mcal(
            c=
            'if(${neg}==0,1.797693135e+308,(${pos}/${posTotal})/(${neg}/${negTotal}))',
            a="growthRate")

        if "uniform" in eArgs and eArgs["uniform"] == True:
            f <<= nm.mcal(
                c='(${pos}/${posTotal})/(${pos}/${posTotal}+(%s-1)*${neg}/${negTotal})'
                % (self.db.clsSize),
                a="postProb")
        else:
            f <<= nm.mcal(c='${pos}/(${pos}+${neg})', a="postProb")

        f <<= nm.msel(c='${pos}>=%s&&${growthRate}>=${minGR}' %
                      (self.minPos))  # filter by min support and minGR
        f <<= nm.mvreplace(vf="pattern",
                           m=items.file,
                           K=items.idFN,
                           f=items.itemFN)
        f <<= nm.mcut(
            f="class,pid,pattern,size,pos,neg,posTotal,negTotal,total,support,growthRate,postProb"
        )
        f <<= nm.mvsort(vf="pattern")
        f <<= nm.msortf(f="class%nr,postProb%nr,pos%nr", o=self.pFile)
        f.run()

        # drop taxonomy nodes that redundantly subsume their items
        if items.taxonomy:
            zdd = VSOP.constant(0)
            dt = nm.mcut(i=self.pFile, f="pattern")

            for fldVal in dt:
                zdd = zdd + VSOP.itemset(fldVal[0])

            zdd = self.reduceTaxo(zdd, self.db.items)

            xxp1 = tf.file()
            xxp2 = tf.file()
            xxp3 = tf.file()
            zdd.csvout(xxp1)

            nm.mcut(nfni=True, f="1:pattern",
                    i=xxp1).mvsort(vf="pattern").msortf(f="pattern",
                                                        o=xxp2).run()
            nm.msortf(f="pattern", i=self.pFile).mcommon(
                k="pattern", m=xxp2).msortf(f="class%nr,postProb%nr,pos%nr",
                                            o=xxp3).run()
            shutil.move(xxp3, self.pFile)

        if self.outtf:
            # master used to select only transactions that contain an
            # enumerated pattern
            xxp4 = nm.mcut(f="class,pid", i=self.pFile)
            f = nm.mcat(i=",".join(tFiles))
            f <<= nm.mjoin(k="class,pid", m=xxpCat,
                           f="ppid")  # attach the unified pattern id
            f <<= nm.mcommon(k="class,ppid", K="class,pid",
                             m=xxp4)  # keep enumerated patterns only
            f <<= nm.mcut(f=self.db.idFN + ",class,ppid:pid")
            f <<= nm.msortf(f=self.db.idFN + ",class,pid", o=self.tFile)
            f.run()
コード例 #9
0
    def __init__(self, nc, ni, col, order):
        """Build a color table for coloring nodes by a data field.

        nc    : name of the color-key field (falsy disables coloring)
        ni    : input file holding the color-key field
        col   : "category" for categorical coloring, or two 6-digit HEX
                codes "RRGGBB,RRGGBB" defining a numeric gradient
        order : "descend" / "ascend" / other -- ordering used to assign
                pallet colors to categorical keys

        FIX: the numeric branch guarded the summary row with
        `len(...) >= 2` but then indexed element [2]; the guard now
        requires at least 3 fields, preventing a possible IndexError.
        """
        self.nc = nc
        self.ni = ni
        self.col = col
        self.range = 0
        self.min = 0
        self.max = 0
        self.order = order

        if self.nc and self.ni:

            if self.col == "category":

                self.type = "category"
                # prepare a color pallet: 16 intensities x 6 channel
                # combinations = 96 distinct colors
                pallet = []
                val = [
                    "FF", "80", "C0", "40", "E0", "60", "A0", "20", "F0", "70",
                    "B0", "30", "D0", "50", "90", "10"
                ]
                for v in val:
                    pallet.append("%s0000" % (v))
                    pallet.append("00%s00" % (v))
                    pallet.append("0000%s" % (v))
                    pallet.append("%s%s00" % (v, v))
                    pallet.append("00%s%s" % (v, v))
                    pallet.append("%s00%s" % (v, v))

                # read the color field and map each distinct value to a
                # pallet entry (at most 96 = len(pallet) keys are kept)
                f = None
                f <<= nm.mcut(f="%s:ckey" % (self.nc), i=self.ni)
                f <<= nm.mdelnull(f="ckey")
                f <<= nm.mcount(k="ckey", a="freq")

                if self.order == "descend":
                    f <<= nm.mbest(s="freq%nr,ckey", fr=0, size=96)
                elif self.order == "ascend":
                    f <<= nm.mbest(s="freq%n,ckey", fr=0, size=96)
                else:
                    f <<= nm.mbest(s="ckey", fr=0, size=96)

                self.cTable = {}
                i = 0
                for flds in f.getline(otype="dict"):
                    cK = flds["ckey"]
                    self.cTable[cK] = pallet[i]
                    i += 1

            else:
                self.type = "numeric"
                ary = col.split(",")
                if len(ary) != 2 or len(ary[0]) != 6 or len(ary[1]) != 6:
                    raise ValueError(
                        "col= takes two 6-digites HEX codes like FF0000,00FF00"
                    )

                # gradient endpoints as RGB components
                self.r0 = int(ary[0][0:2], 16)
                self.g0 = int(ary[0][2:4], 16)
                self.b0 = int(ary[0][4:6], 16)
                self.r1 = int(ary[1][0:2], 16)
                self.g1 = int(ary[1][2:4], 16)
                self.b1 = int(ary[1][4:6], 16)

                # min/max of the color field, used to scale the gradient
                f = None
                f <<= nm.mcut(f="%s:ckey" % (self.nc), i=self.ni)
                f <<= nm.mdelnull(f="ckey")
                f <<= nm.msummary(f="ckey", c="min,max")
                xxcTable = f.run()
                if len(xxcTable) > 0:
                    # FIX: need at least 3 fields before reading [1] (min)
                    # and [2] (max); the original checked >= 2 and could
                    # raise IndexError on [2]
                    if len(xxcTable[0]) >= 3:
                        if xxcTable[0][1] != "":
                            self.min = float(xxcTable[0][1])
                        if xxcTable[0][2] != "":
                            self.max = float(xxcTable[0][2])

                        if xxcTable[0][1] != "" and xxcTable[0][2] != "":
                            self.range = self.max - self.min
コード例 #10
0
	def enumerate(self,eArgs):
		"""
		Enumerate frequent itemsets under the conditions given in eArgs.

		:type eArgs: dict
		:type eArgs['type']: str
		:type eArgs['minCnt']: int
		:type eArgs['minSup']: float
		:type eArgs['maxCnt']: int
		:type eArgs['maxSup']: float
		:type eArgs['minLen']: int
		:type eArgs['maxLen']: int
		:type eArgs['top']: int
		:type eArgs['skipTP']: bool [default: False]
		:param eArgs: enumeration parameters
		:param eArgs['type']: type of itemsets to extract ('F': frequent, 'C': closed, 'M': maximal)
		:param eArgs['minCnt']: minimum support (count)
		:param eArgs['minSup']: minimum support (ratio)
		:param eArgs['maxCnt']: maximum support (count)
		:param eArgs['maxSup']: maximum support (ratio)
		:param eArgs['minLen']: minimum number of items in an itemset
		:param eArgs['maxLen']: maximum number of items in an itemset
		:param eArgs['top']: number of top-support patterns to enumerate
		:param eArgs['skipTP']: suppress output of the patterns (itemsets) matching each transaction
		"""

		tf=mtemp.Mtemp()
		self.eArgs=eArgs
		self.type = eArgs["type"]

		# minimum support: given either as a count (minCnt) or a ratio (minSup)
		if "minCnt" in eArgs and eArgs["minCnt"] != None:
			self.minCnt = int(eArgs["minCnt"])
			self.minSup = float(self.minCnt) / float(self.db.traSize)
		else:
			self.minSup = float(eArgs["minSup"])
			self.minCnt = int(self.minSup * float(self.db.traSize) + 0.99)

		# maximum support: given either as a count (maxCnt) or a ratio (maxSup)
		self.maxCnt=None
		if ("maxCnt" in eArgs and  eArgs["maxCnt"]!= None) or ( "maxSup" in eArgs and eArgs["maxSup"]!= None):
			if "maxCnt" in eArgs and eArgs["maxCnt"]!= None:
				self.maxCnt = int(eArgs["maxCnt"])
				self.maxSup = float(self.maxCnt) / float(self.db.traSize)
			else:
				self.maxSup    = float(eArgs["maxSup"])
				self.maxCnt = int(self.maxSup * float(self.db.traSize) + 0.99)


		params = {}
		if self.msgoff:
			params["type"] ="%sIf_"%(self.type)
		else:
			params["type"] ="%sIf"%(self.type)


		if self.maxCnt :
			params["U"] = str(self.maxCnt)

		if "minLen" in eArgs and eArgs["minLen"] != None :
			params["l"] = str(eArgs['minLen'])

		if "maxLen" in eArgs and eArgs["maxLen"] != None :
			params["u"] = str(eArgs['maxLen'])

		# upper bound on the number of patterns to enumerate
		if "top" in eArgs and eArgs["top"] != None :
			self.top = eArgs["top"]

		# when top is given, run lcm once to derive the minimum support
		# count that yields at most `top` patterns
		if self.top and self.top>0 :

			xxtop = tf.file()
			import copy
			top_params = copy.deepcopy(params)
			top_params["i"] = self.file
			top_params["sup"] = "1"
			top_params["K"] = str(self.top)
			top_params["so"] = xxtop
			import re
			# strip the trailing '_' (msgoff variant) for the top-K run
			top_params["type"] = re.sub('_$', '', top_params["type"] )

			extTake.lcm(top_params)

			with open(xxtop, "r") as rfile:
				self.minCnt = int(rfile.read().strip())

			if self.minCnt<0 :
				self.minCnt=1


		self.skipTP=False
		if "skipTP" in eArgs:
			self.skipTP=eArgs["skipTP"]

		# lcm output file
		lcmout = tf.file()

		# lcm does not create its output file when no frequent pattern is
		# found, so create an empty file beforehand just in case
		with open(lcmout, "w") as efile:
			pass

		# run lcm
		params["i"] = self.file
		params["sup"] = str(self.minCnt)
		params["o"] = lcmout
		extTake.lcm(params)

		# enumerate 1-itemsets, needed later for the lift computation
		xxone= tf.file()
		tpstr = "FIf_" if self.msgoff else "FIf"

		extTake.lcm(type=tpstr,i=self.file,sup=1,o=xxone,l=1,u=1)


		# compute the support of each pattern and write it as CSV
		xxp0 = tf.file()
		self.pFile = self.temp.file()
		items=self.db.items
		trans0 = self.temp.file()

		extTake.lcmtrans(lcmout,"p",trans0)

		f =   nm.mdelnull(i=trans0,f="pattern")
		f <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
		f <<= nm.msetstr(v=self.db.traSize,a="total")
		f <<= nm.mcal(c='${count}/${total}',a="support")
		f <<= nm.mcut(f="pid,pattern,size,count,total,support")
		f <<= nm.mvsort(vf="pattern")
		f <<= nm.msortf(f="pid",o=xxp0)
		f.run()


		# xxp0 layout:
		# pid,count,total,support,pattern
		# 0,13,13,1,A
		# 4,6,13,0.4615384615,A B
		xxp1=tf.file()

		# no taxonomy given (added 2010/11/20)
		if items.taxonomy==None:
			shutil.move(xxp0,xxp1)
		# taxonomy given: drop rules that are redundant w.r.t. the taxonomy
		else:

			zdd=VSOP.constant(0)
			fobj = nm.mcut(i=xxp0,f='pattern')
			for fldVal in fobj:
				zdd=zdd+VSOP.itemset(fldVal[0])


			zdd=self.reduceTaxo(zdd,self.db.items)
			xxz1=tf.file()
			xxz2=tf.file()
			zdd.csvout(xxz1)

			f0=None
			f0 <<= nm.mcut(nfni=True,f="1:pattern",i=xxz1)
			f0 <<= nm.mvsort(vf="pattern")
			f0 <<= nm.msortf(f="pattern")

			f=None
			f <<= nm.msortf(f="pattern",i=xxp0)
			f <<= nm.mcommon(k="pattern",m=f0)
			f <<= nm.msortf(f="pid",o=xxp1)
			f.run()


		# lift computation
		transl = tf.file()
		extTake.lcmtrans(xxone,"p",transl)

		# 1-itemset counts, keyed by the item name
		xxp2 =   nm.mdelnull(i=transl,f="pattern")
		xxp2 <<= nm.mvreplace(vf="pattern",m=items.file,K=items.idFN,f=items.itemFN)
		xxp2 <<= nm.msortf(f="pattern")

		# per-pattern sum of log(count) over its constituent items
		xxp3 =   nm.mcut(f="pid,pattern",i=xxp1)
		xxp3 <<= nm.mtra(f="pattern",r=True)
		xxp3 <<= nm.mjoin(k="pattern",m=xxp2,f="count:c1")
		xxp3 <<= nm.mcal(c='ln(${c1})',a="c1ln")
		xxp3 <<= nm.msum(k="pid",f="c1ln")

		# xxp3 layout:
		# pid,pattern,c1,c1ln
		# 0,A,13,2.564949357
		# 1,E,7,1.945910149

		f3 =   nm.mjoin(k="pid",f="c1ln",i=xxp1,m=xxp3)
		f3 <<= nm.mcal(c='round(exp(ln(${count})-${c1ln}+(${size}-1)*ln(${total})),0.0001)',a="lift")
		f3 <<= nm.mcut(f="pid,size,count,total,support,lift,pattern")
		f3 <<= nm.msortf(f="support%nr",o=self.pFile)
		f3.run()


		if not self.skipTP:
			# write out, per transaction, the patterns occurring in it

			self.tFile = self.temp.file()
			xxw3i = tf.file()
			extTake.lcmtrans(lcmout,"t",xxw3i)

			# xxw1: mapping from sequential __tid to the original id;
			# xxw2: pids that survived the filtering above
			xxw1 = nm.mcut(f=self.db.idFN,i=self.db.file).muniq(k=self.db.idFN).mnumber(S=0,a="__tid",q=True).msortf(f="__tid")
			xxw2 = nm.mcut(f="pid",i=self.pFile)
			xxw3 = nm.mcommon(k="pid",i=xxw3i,m=xxw2).mjoin(k="__tid",m=xxw1,f=self.db.idFN).mcut(f=self.db.idFN+",pid",o=self.tFile)
			xxw3.run()