Ejemplo n.º 1
0
def toNum():
    """Derive the three "all" data files from ``<datPath>/online_all.csv``.

    Outputs, all written under ``datPath``:

    * ``onlineM_all.csv``    -- unique ``InvoiceNo,StockCode`` pairs.
    * ``onlineT_all.csv``    -- per-invoice item numbers (``StockCode``
      replaced by the ``num`` assigned with ``mnumber``), no header line.
    * ``onlineO_all.basket`` -- ``onlineT_all.csv`` with every space turned
      into a comma by ``tr``.

    Uses the module-level names ``datPath``, ``debug``, ``nm`` and ``os``.
    """
    for _size in ("all",):  # a single data set is handled
        src = "%s/online_all.csv" % datPath
        take_core_out = "%s/onlineT_all.csv" % datPath  # data for Take.core
        orange_out = "%s/onlineO_all.basket" % datPath  # data for Orange
        take_out = "%s/onlineM_all.csv" % datPath  # data for Take

        # Unique (invoice, item) pairs.
        pairs = None
        for stage in (
                nm.mcut(f="InvoiceNo,StockCode", i=src),
                nm.muniq(k="InvoiceNo,StockCode"),
                nm.mfldname(q=True, o=take_out),
        ):
            pairs <<= stage
        pairs.run(msg=debug)

        # Item code -> sequential number lookup.
        item_numbers = None
        for stage in (
                nm.mcut(f="StockCode", i=src),
                nm.muniq(k="StockCode"),
                nm.mnumber(s="StockCode", a="num"),
        ):
            item_numbers <<= stage

        # Replace item codes by their numbers and gather them per invoice.
        baskets = None
        for stage in (
                nm.mjoin(k="StockCode", m=item_numbers, f="num", i=src),
                nm.mcut(f="InvoiceNo,num:StockCode"),
                nm.mtra(k="InvoiceNo", f="StockCode"),
                nm.mcut(f="StockCode", nfno=True, o=take_core_out),
        ):
            baskets <<= stage
        baskets.run(msg=debug)

        # Same content, comma separated instead of space separated.
        os.system("tr ' ' ',' <%s >%s" % (take_core_out, orange_out))
Ejemplo n.º 2
0
def calTime(iFile, oFile):
    """Cross-tabulate a ``mean(sd)`` timing string by method and data size.

    Args:
        iFile: input CSV; the pipeline reads the fields ``mean``, ``sd``,
            ``method`` and ``dataSize`` from it.
        oFile: output CSV with the columns ``method``, ``small``, ``middle``
            and ``large`` (renamed from the data sizes 10000, 1000000 and
            100000000).
    """
    stages = (
        nm.mnumber(q=True, a="id", i=iFile),
        # Concatenate mean and sd into a single string field "mean(sd)".
        nm.mcal(c='$s{mean}+"("+$s{sd}+")"', a="time"),
        nm.m2cross(k="method", s="dataSize", f="time"),
        # Sort numerically on the id assigned by mnumber above.
        nm.msortf(f="id%n"),
        nm.mcut(f="method,10000:small,1000000:middle,100000000:large"),
        nm.mfldname(q=True, o=oFile),
    )

    flow = None
    for stage in stages:
        flow <<= stage
    flow.run()
Ejemplo n.º 3
0
    def convRsl(self, ifile, ofile, map1, map2, logDir=None):
        """Write the converged graph back out using the original node names.

        Converts the micro-cluster graph that converged during the preceding
        iteration from numbered nodes back to the original node strings.

        Args:
            ifile: headerless input file; its first field is read as ``tra``.
            ofile: name of the output file.
            map1: lookup joined on ``num1`` to fetch the field ``self.ef1``.
            map2: lookup joined on ``num2`` to fetch the field ``self.ef2``.
            logDir: when truthy, the result is written to
                ``<logDir>/<ofile>`` instead of ``ofile``.
        """
        # The original used "{}/#{ofile}".format(logDir, ofile), a leftover
        # Ruby interpolation that raises KeyError('ofile') in Python.
        out_path = "{}/{}".format(logDir, ofile) if logDir else ofile

        f = None
        f <<= nm.mcut(nfni=True, f="0:tra", i=ifile)
        # Strip the trailing blank from each line.
        f <<= nm.msed(f="tra", c=' $', v="")
        # The 1-based line number becomes the first node number.
        f <<= nm.mnumber(q=True, S=1, a="num1")
        # Expand the items of "tra" into one row per item (num2).
        f <<= nm.mtra(r=True, f="tra:num2")
        f <<= nm.mjoin(k="num2", m=map2, f=self.ef2)
        f <<= nm.mjoin(k="num1", m=map1, f=self.ef1)
        f <<= nm.msortf(f="num1%n,num2%n")
        f <<= nm.mcut(f=[self.ef1, self.ef2])
        f <<= nm.mfldname(q=True, o=out_path)

        f.run()
Ejemplo n.º 4
0
def calRelative(iFile, oFile):
    """Cross-tabulate each method's mean relative to the ``mcut`` baseline.

    The baseline rows are those of ``methods.csv`` whose ``method`` field is
    ``mcut``; their ``mean`` is joined in as ``base`` on ``dataSize`` and the
    score is ``round(mean/base, 0.1)``.

    Args:
        iFile: input CSV; the pipeline reads ``mean``, ``method`` and
            ``dataSize`` from it.
        oFile: output CSV with the columns ``method``, ``small``, ``middle``
            and ``large`` (renamed from the data sizes 10000, 1000000 and
            100000000).
    """
    # Baseline: the rows measured for the "mcut" method.
    baseline = None
    baseline <<= nm.mselstr(f="method", v="mcut", i="methods.csv")

    stages = (
        nm.mnumber(q=True, a="id", i=iFile),
        nm.mjoin(k="dataSize", m=baseline, f="mean:base"),
        nm.mcal(c='round(${mean}/${base},0.1)', a="score"),
        nm.m2cross(k="method", s="dataSize", f="score"),
        # Sort numerically on the id assigned by mnumber above.
        nm.msortf(f="id%n"),
        nm.mcut(f="method,10000:small,1000000:middle,100000000:large"),
        nm.mfldname(q=True, o=oFile),
    )

    flow = None
    for stage in stages:
        flow <<= stage
    flow.run()
Ejemplo n.º 5
0
def mnest2tree(ei, ef, k, ni=None, nf=None, ev=None, no=None, eo=None):
    """Rewrite (key, ef1, ef2) edge rows as key->ef1 and key->ef2 edges.

    Every input row of ``ei`` yields two rows in a temporary file,
    ``(k, ef1)`` and ``(k, ef2)``, which are joined back to ``ei``, reduced
    per new edge (averaged when ``ev`` is given, otherwise made unique) and
    written to ``eo`` under the original field names ``ef1``/``ef2``.
    When ``ni`` is given, a node file is additionally written to ``no``.

    Args:
        ei: edge input file.
        ef: two edge field names separated by a comma ("ef1,ef2").
        k: name of the key field in ``ei``.
        ni: optional node input file.
        nf: name of the node field in ``ni``.
        ev: optional field(s) passed to ``mavg`` for each new edge.
        no: node output file (used only when ``ni`` is given).
        eo: edge output file.
    """
    # TODO: add parameter checking (e.g. ``ef`` must hold two field names,
    # otherwise ``efs[1]`` below raises IndexError).
    efs = ef.split(",")
    ef1 = efs[0]
    ef2 = efs[1]

    # Rename the fields with a '#' prefix so they cannot clash with the
    # '#ef1'/'#ef2' fields created below.
    f = nm.mcut(f="%s:#orgKey,%s:#orgEf1,%s:#orgEf2" % (k, ef1, ef2), i=ei)

    temp = mtemp.Mtemp()
    of = temp.file()

    # NOTE(review): this uses `_nu` while the node branch below uses `nu` --
    # confirm that both names are in scope in this module.
    with _nu.mcsvout(o=of, f="#orgKey,#orgEf1,#orgEf2,#ef1,#ef2") as oCSV:
        for flds in f:
            orgKey = flds[0]
            orgEf1 = flds[1]
            orgEf2 = flds[2]
            # One input row becomes two edges: key->ef1 and key->ef2.
            oCSV.write([orgKey, orgEf1, orgEf2, orgKey, orgEf1])
            oCSV.write([orgKey, orgEf1, orgEf2, orgKey, orgEf2])

    f = None
    f <<= nm.mjoin(k="#orgKey,#orgEf1,#orgEf2", K=[k, ef1, ef2], m=ei,
                   i=of)  # join all fields
    if ev:
        f <<= nm.mavg(k="#ef1,#ef2", f=ev)
    else:
        f <<= nm.muniq(k="#ef1,#ef2")

    # r=True: drop the listed helper fields and keep everything else.
    f <<= nm.mcut(r=True, f="#orgKey,#orgEf1,#orgEf2")
    f <<= nm.mfldname(f="#ef1:%s,#ef2:%s" % (ef1, ef2), o=eo)
    f.run()

    if ni:
        head = nu.mheader(i=ni)
        # All node-file fields except the node field itself.
        fldnames = [s for s in head if s != nf]
        # One comma fewer than the number of remaining fields.
        commas = ',' * (len(fldnames) - 1)

        # Values of the first edge field in ``eo`` that are absent from the
        # node file (mcommon with r=True).
        f0 = None
        f0 <<= nm.mcut(f="%s:%s" % (ef1, nf), i=eo)
        f0 <<= nm.muniq(k=nf)
        f0 <<= nm.mcommon(k=nf, m=ni, r=True)
        # NOTE(review): presumably pads the new nodes with empty values for
        # the remaining fields -- confirm how msetstr treats a list for a=.
        f0 <<= nm.msetstr(v=commas, a=fldnames)

        f = nm.mcut(f=k, r=True, i=[ni, f0])
        f <<= nm.msetstr(v="", a=k, o=no)
        f.run()
Ejemplo n.º 6
0
    def run(self, **kw_args):
        """Partition the graph with ``gpmetis`` and write node/cluster rows.

        Steps, in order:

        1. Build a unique, symmetric edge list from ``self.eFile`` and a
           unique node list (from ``self.nFile`` when given, otherwise from
           the edge end points).
        2. Number the nodes from 1 and write name->number and number->name
           tables.
        3. Write the METIS graph file (header line plus one body line per
           node) to a temporary file.
        4. Optionally copy the number->name table to ``self.mFile`` and the
           graph file to ``self.dFile``.
        5. Unless ``self.noexe`` is set, run ``gpmetis`` through the shell
           and write the node name / cluster pairs to ``self.oFile``.

        Args:
            **kw_args: only ``msg`` is inspected; ``msg="on"`` sets
                ``KG_ScpVerboseLevel`` to "3".

        Raises:
            Exception: if a node of the edge file is missing from the node
                file, or if ``gpmetis`` produced no ``.part.*`` output.
        """

        # NOTE(review): this sets KG_VerboseLevel while the msg="on" branch
        # sets KG_ScpVerboseLevel -- confirm both variable names are intended.
        os.environ["KG_VerboseLevel"] = "2"
        if "msg" in kw_args:
            if kw_args["msg"] == "on":
                os.environ['KG_ScpVerboseLevel'] = "3"

        temp = Mtemp()
        xxedge = temp.file()
        xxnode = temp.file()
        xxnam2num = temp.file()
        xxnum2nam = temp.file()
        xxebase = temp.file()
        xxbody = temp.file()

        # cleaning the edge data: keep the two node fields (and the weight
        # when ew= is specified) and remove duplicate edges
        e1 = None
        if self.ew:
            e1 <<= nm.mcut(f="%s:__node1,%s:__node2,%s:__weight" %
                           (self.ef1, self.ef2, self.ew),
                           i=self.eFile)
        else:
            e1 <<= nm.mcut(f="%s:__node1,%s:__node2" % (self.ef1, self.ef2),
                           i=self.eFile)

        e1 <<= nm.muniq(k="__node1,__node2")

        # the same edges with the two end points swapped
        e2 = nm.mfldname(i=e1, f="__node2:__node1,__node1:__node2")

        # both directions of every edge, made unique
        fe = None
        fe <<= nm.muniq(k="__node1,__node2", i=[e1, e2], o=xxedge)
        fe.run()

        # cleaning the node data (remove duplicate nodes)
        fn = None
        if self.nFile:
            if self.nw:
                fn <<= nm.mcut(f="%s:__node,%s" % (self.nf, self.nw),
                               i=self.nFile)
            else:
                fn <<= nm.mcut(f="%s:__node" % (self.nf), i=self.nFile)

            fn <<= nm.muniq(k="__node", o=xxnode)

        else:
            # no node file: the node list is the set of edge end points
            xxen1 = nm.mcut(f="__node1:__node", i=xxedge)
            xxen2 = nm.mcut(f="__node2:__node", i=xxedge)
            fn <<= nm.muniq(k="__node", o=xxnode, i=[xxen1, xxen2])

        fn.run()

        # build the node name <=> node number conversion tables
        fmap = None
        fmap <<= nm.mcut(f="__node", i=xxnode)
        fmap <<= nm.mnumber(a="__num", S=1, q=True, o=xxnam2num)
        fmap <<= nm.msortf(f="__num", o=xxnum2nam)
        fmap.run()

        # when a node file is given, check its consistency with the edge file
        if self.nFile:
            # xxedge holds both directions of every edge, so __node1 alone
            # covers every end point
            ncheck = nm.mcut(f="__node1:__node", i=xxedge)
            ncheck <<= nm.mcommon(k="__node", m=xxnam2num, r=True)
            nmatch = ncheck.run()
            if len(nmatch) > 0:
                raise Exception(
                    "#ERROR# the node named '%s' in the edge file doesn't exist in the node file."
                    % (nmatch[0][0]))

        # METIS graph file format
        # first line: n m [fmt] [ncon]
        # n: number of nodes, m: number of edges, ncon: number of node weights
        # 1xx: node sizes present (not used, meaning always "0")
        # x1x: node weights present
        # xx1: edge weights present
        # s w_1 w_2 ... w_ncon v_1 e_1 v_2 e_2 ... v_k e_k
        # s: node size (node sizes cannot be used)
        # w_x: node weight
        # v_x: number (line number) of a connected node
        # e_x: edge weight

        # --------------------
        # generate edge data using the integer numbered nodes
        #fnnum = None
        fnnum = nm.mcut(f="__num:__node_n1", i=xxnam2num)  # {xxnnum}

        fenum = None
        fenum <<= nm.mjoin(k="__node1",
                           K="__node",
                           f="__num:__node_n1",
                           m=xxnam2num,
                           i=xxedge)
        fenum <<= nm.mjoin(k="__node2",
                           K="__node",
                           f="__num:__node_n2",
                           m=xxnam2num)
        fenum <<= nm.msortf(f="__node_n1")  # {xxenum}

        # join the edges onto the full node list
        # NOTE(review): n=True presumably keeps nodes without any edge so
        # that every node gets a body line -- confirm against mnjoin's docs
        febase = None
        febase <<= nm.mnjoin(k="__node_n1", m=fenum, i=fnnum, n=True)
        febase <<= nm.msortf(f="__node_n1%n,__node_n2%n",
                             o=xxebase)  # {xxebase}
        febase.run()

        fbody = None
        if not self.ew:
            fbody <<= nm.mcut(f="__node_n1,__node_n2", i=xxebase)
            fbody <<= nm.mtra(k="__node_n1", f="__node_n2", q=True)
            fbody <<= nm.mcut(f="__node_n2", nfno=True, o=xxbody)

        # if ew= is specified, merge the weight data into the edge data.
        else:
            # neighbours get the even sequence numbers 0,2,4,... and weights
            # the odd ones 1,3,5,... so sorting on __seq interleaves them
            febody = None
            febody <<= nm.mcut(f="__node_n1,__node_n2:__v", i=xxebase)
            febody <<= nm.mnumber(S=0, I=2, a="__seq", q=True)

            fwbody = None
            fwbody <<= nm.mcut(f="__node_n1,__weight:__v", i=xxebase)
            fwbody <<= nm.mnumber(S=1, I=2, a="__seq", q=True)

            fbody <<= nm.msortf(f="__seq%n", i=[febody, fwbody])
            fbody <<= nm.mtra(k="__node_n1", f="__v", q=True)
            fbody <<= nm.mcut(f="__v", nfno=True, o=xxbody)

        fbody.run()
        # xxbody
        # 2 7 3 8 5 9
        # 1 7 3 10 5 11 7 12
        # 1 8 2 10 4 13 7 14

        # --------------------
        # generate node data using integer number
        if self.nFile and self.nw:
            # xxnode
            # __node,v1,v2
            # a,1,1
            # b,1,1
            # c,1,1
            xxnbody = temp.file()
            xxnbody1 = temp.file()
            fnbody = None
            fnbody <<= nm.mjoin(k="__node", f="__num", i=xxnode, m=xxnam2num)
            fnbody <<= nm.msortf(f="__num%n")
            fnbody <<= nm.mcut(f=self.nw, nfno=True)
            fnbody <<= nm.cmd("tr ',' ' ' ")  # tricky!!
            fnbody <<= nm.mwrite(o=xxnbody)
            fnbody.run()
            # xxnbody
            # 1 1
            # 1 1
            # 1 1
            # paste the node weight with edge body
            fnbody1 = None
            fnbody1 <<= nm.mpaste(nfn=True, m=xxbody, i=xxnbody)
            fnbody1 <<= nm.cmd("tr ',' ' ' ")
            fnbody1 <<= nm.mwrite(o=xxnbody1)
            fnbody1.run()
            # the pasted file replaces the edge-only body
            os.system("mv %s %s" % (xxnbody1, xxbody))

        # xxbody
        # 1 1 2 7 3 8 5 9
        # 1 1 1 7 3 10 5 11 7 12
        # 1 1 1 8 2 10 4 13 7 14

        # xxedge stores each edge in both directions, hence the division by 2
        # NOTE(review): `/=` is true division under Python 3, so eSize becomes
        # a float; the "%d" below formats it as an integer
        eSize = mrecount(i=xxedge)
        eSize /= 2
        nSize = mrecount(i=xxnode)
        nwFlag = 1 if self.nw else 0
        ewFlag = 1 if self.ew else 0

        # fmt digits: node size (always 0), node weight flag, edge weight flag
        fmt = "0%d%d" % (nwFlag, ewFlag)

        xxhead = temp.file()
        xxgraph = temp.file()

        # header line followed by the body = complete graph file
        os.system("echo '%d %d %s %d' > %s" %
                  (nSize, eSize, fmt, self.ncon, xxhead))
        os.system("cat  %s %s > %s" % (xxhead, xxbody, xxgraph))

        if self.mFile:
            nm.mfldname(f="__num:num,__node:node", i=xxnum2nam,
                        o=self.mFile).run()

        if self.dFile:
            os.system("cp %s %s" % (xxgraph, self.dFile))

        if not self.noexe:
            # the only difference between the two calls: the non-verbose one
            # discards gpmetis' standard output
            if self.verbose:
                os.system(
                    "gpmetis -seed=%d -ptype=%s -ncuts=%d -ufactor=%d %s %d" %
                    (self.seed, self.ptype, self.ncuts, self.ufactor, xxgraph,
                     self.kway))
            else:
                os.system(
                    "gpmetis -seed=%d -ptype=%s -ncuts=%d -ufactor=%d %s %d  > /dev/null"
                    % (self.seed, self.ptype, self.ncuts, self.ufactor,
                       xxgraph, self.kway))
            import glob
            # the exit status of os.system is not checked; success is judged
            # by the presence of a partition file
            if len(glob.glob(xxgraph + ".part.*")) == 0:
                raise Exception(
                    "#ERROR# command `gpmetis' didn't output any results")

            # convert the node numbers back to the original node names
            # <xxgraph>.part.<kway>
            # 1
            # 0
            # 1
            fo = None
            fo <<= nm.mcut(f="0:cluster",
                           nfni=True,
                           i=xxgraph + ".part." + str(self.kway))
            # line i of the partition file belongs to node number i (1-based)
            fo <<= nm.mnumber(S=1, a="__num", q=True)
            fo <<= nm.mjoin(k="__num", f="__node", m=xxnum2nam)
            fo <<= nm.msortf(f="__node,cluster")
            if self.nf:
                fo <<= nm.mcut(f="__node:%s,cluster" % (self.nf), o=self.oFile)
            else:
                fo <<= nm.mcut(f="__node:node,cluster", o=self.oFile)
            fo.run()

        nu.mmsg.endLog(self.__cmdline())
Ejemplo n.º 7
0
	def run(self):
		"""Enumerate item pairs with ``sspc`` and write edge/node statistics.

		Steps, in order:

		1. Convert ``self.iFile`` into the sspc input file and an item map
		   (``convN`` when ``self.num`` is set, ``conv`` otherwise).
		2. Run ``extTake.sspc`` with the minimum support and the similarity
		   threshold ``self.th``.
		3. Turn the sspc output into ``self.oeFile`` with the columns
		   node1, node2, frequency, frequency1, frequency2, total, support,
		   confidence, lift, jaccard and PMI.
		4. When ``self.onFile`` is set, write per-item frequency and support.
		5. When ``self.logFile`` is set, write the arguments and the elapsed
		   time as key/value rows.

		Side effect: ``self.th`` is set to 0 when ``self.sim`` is falsy.
		"""

		from datetime import datetime	
		t = datetime.now()

		temp=nu.Mtemp()
		xxsspcin=temp.file()
		xxmap=temp.file()

		# convert the transaction file and create the map file
		if self.num :
			total = self.convN(self.iFile,self.idFN,self.itemFN,xxsspcin,xxmap)
		else:
			total = self.conv(self.iFile,self.idFN,self.itemFN,xxsspcin,xxmap)

		# system "head xxsspcin"
		# 3 5 0 2
		# 4 1 2
		# 0 2 3 1
		# 1 0 2
		# 3 4 0 1
		# system "head xxmap"
		# ##item,##freq%0nr,##num
		# b,4,0
		# d,4,1
		# f,4,2
		# a support ratio, when given, takes precedence over the absolute count
		minSupp = int(total*self.minSupPrb)	if self.minSupPrb else self.minSupCnt
			

		# similarity type string for sspc
		# NOTE(review): a truthy self.sim other than "J", "P" or "C" leaves
		# sspcSim unbound and fails below -- presumably validated by the
		# caller; confirm
		if self.sim :
			if self.sim=="J":
				sspcSim="R"
			elif self.sim=="P":
				sspcSim="P"
			elif self.sim=="C":
				sspcSim="i"
		# when sim= is omitted, use R with th=0 (no similarity constraint)
		else:
			sspcSim="R"
			self.th=0

		############ enumeration body ############
		xxsspcout=temp.file()
		# a trailing "_" is appended to the type string when msgoff is set
		tpstr =  sspcSim+"ft_" if self.msgoff else sspcSim+"ft"
		extTake.sspc(type=tpstr,TT=minSupp,i=xxsspcin,th=self.th,o=xxsspcout)

		##################################

		xxtmmp=temp.file()
		
		# turn blanks and parentheses of the sspc output into commas so that
		# it can be read as CSV
		f =   nm.mread(i=xxsspcout) 
		f <<= nm.cmd("tr ' ()' ','") 
		f <<= nm.mcut(f="1:i1,2:i2,0:frequency,4:sim",nfni=True)

		if self.num :

			# the items are used directly as the node names
			f <<= nm.mfldname(f="i1:node1,i2:node2")
			if self.sim!="C":
				f <<= nm.mfsort(f="node1,node2")
			
			f <<= nm.mjoin(k="node1",K="##item",m=xxmap,f="##freq:frequency1")
			f <<= nm.mjoin(k="node2",K="##item",m=xxmap,f="##freq:frequency2") 
			
		else:

			# map the item numbers back to item names and frequencies
			f <<= nm.mjoin(k="i1",K="##num",m=xxmap,f="##item:node1,##freq:frequency1")
			f <<= nm.mjoin(k="i2",K="##num",m=xxmap,f="##item:node2,##freq:frequency2") 

			if self.sim!="C":

				# sort each pair of names (copies node1x/node2x), then pick
				# the frequency that belongs to each name after the sort
				f <<= nm.mcut(f="i1,i2,frequency,sim,node1,node2,frequency1,frequency2,node1:node1x,node2:node2x")
				f <<= nm.mfsort(f="node1x,node2x")
				f <<= nm.mcal(c='if($s{node1}==$s{node1x},$s{frequency1},$s{frequency2})',a="freq1")
				f <<= nm.mcal(c='if($s{node2}==$s{node2x},$s{frequency2},$s{frequency1})',a="freq2")
				f <<= nm.mcut(f="i1,i2,frequency,sim,node1x:node1,node2x:node2,freq1:frequency1,freq2:frequency2")

		# measures computed from the pair frequency, the two item frequencies
		# and the total returned by conv/convN
		f <<= nm.msetstr(v=total,a="total")
		f <<= nm.mcal(c='${frequency}/${frequency1}',a="confidence")
		f <<= nm.mcal(c='${frequency}/${total}',a="support")
		f <<= nm.mcal(c='${frequency}/(${frequency1}+${frequency2}-${frequency})',a="jaccard")
		f <<= nm.mcal(c='(${frequency}*${total})/((${frequency1}*${frequency2}))',a="lift")
		f <<= nm.mcal(c='(ln(${frequency})+ln(${total})-ln(${frequency1})-ln(${frequency2}))/(ln(${total})-ln(${frequency}))',a="PMI")
		f <<= nm.mcut(f="node1,node2,frequency,frequency1,frequency2,total,support,confidence,lift,jaccard,PMI")
		f <<= nm.msortf(f="node1,node2",o=self.oeFile)
		f.run()

		if self.onFile:
			# node statistics: frequency of every item in the input file
			f4 =   nm.mcut(f=self.itemFN+":node",i=self.iFile)
			f4 <<= nm.mcount(k="node",a="frequency")
			if self.node_support :
				# keep only the items whose frequency is at least minSupp
				minstr = "[%s,]"%(minSupp)
				f4 <<= nm.mselnum(f="frequency",c=minstr)

			f4 <<= nm.msetstr(v=total,a="total")
			f4 <<= nm.mcal(c='${frequency}/${total}',a="support")
			f4 <<= nm.mcut(f="node,support,frequency,total",o=self.onFile)
			f4.run()

		procTime=datetime.now()-t

		# write the log file
		if self.logFile :
			kv=[["key","value"]]
			for k,v in self.args.items():
				kv.append([k,str(v)])
			kv.append(["time",str(procTime)])
			nm.writecsv(i=kv,o=self.logFile).run()
Ejemplo n.º 8
0
    def run(self, **kw_args):
        """Run ``sketchsort`` on ``self.iFile`` and write the pairs found.

        Steps, in order:

        1. Number the input rows from 0 and write (a) a row number -> tid
           map and (b) a blank-separated, headerless data file holding the
           weight field(s) followed by the element fields.
        2. Call ``extMining.sketchsort`` with the distance, window, seed and
           missing-ratio settings held on ``self``.
        3. Replace the row numbers of the reported pairs by their tids
           (second row suffixed with "2") and write the result to
           ``self.oFile``.

        Args:
            **kw_args: only ``msg`` is inspected; ``msg="on"`` raises
                ``KG_ScpVerboseLevel`` from "2" to "4".

        Raises:
            Exception: if ``extMining.sketchsort`` returns a truthy status.

        Side effect: when ``self.wfH`` is falsy it is replaced by a
        one-element list naming a generated weight field filled with 0.
        """

        os.environ['KG_ScpVerboseLevel'] = "2"
        if kw_args.get("msg") == "on":
            os.environ['KG_ScpVerboseLevel'] = "4"

        # name of the line-number field
        ln = "{}line".format(self.pt)

        xxmap = self.workf.file()
        sdata = self.workf.file()

        # convert the data for sketchsort
        xx1 = nm.mnumber(S=0, a=ln, q=True, i=self.iFile)
        if self.wfH:
            xx2 = nm.mcut(f=self.wfH + self.tidH + self.elem, i=xx1)
        else:
            # no weight field given: add one that is 0 on every row
            self.wfH = ["{}wf".format(self.pt)]
            xx2 = nm.msetstr(v=0, a=self.wfH, i=xx1)
            xx2 <<= nm.mcut(f=self.wfH + self.tidH + self.elem)

        # row number -> tid map, used to translate the result back
        fmap = nm.mcut(f=[ln] + self.tidH, i=xx1, o=xxmap)
        xx2 <<= nm.mcut(f=self.wfH + self.elem, nfno=True)
        xx2 <<= nm.cmd("tr ',' ' '")
        xx2 <<= nm.mwrite(o=sdata)
        nm.runs([fmap, xx2])

        # do sort
        outf = self.workf.file()
        para = {}
        if self.dist == "C":
            para["cosdist"] = self.th
        elif self.dist == "H":
            para["hamdist"] = self.th

        if not self.uc:
            para["centering"] = True

        para["auto"] = True
        para["windowsize"] = self.ws
        para["seed"] = self.seed
        para["missingratio"] = self.mr
        para["i"] = sdata
        para["o"] = outf
        status = extMining.sketchsort(para)
        if status:
            raise Exception("#ERROR# checking sketchsort messages")

        # "tid:tid2" renames so the second row's tids do not clash
        tid2 = ",".join("{}:{}2".format(val, val) for val in self.tidH)

        f = nm.mread(i=outf)
        f <<= nm.cmd("tr ' ' ',' ")
        f <<= nm.mcut(nfni=True, f="0:eline1,1:eline2,2:distance")
        f <<= nm.mfsort(f="eline*")
        # fetch the tids that correspond to the two row numbers
        f <<= nm.mjoin(k="eline1", K=ln, f=self.tidH, m=xxmap)
        f <<= nm.mjoin(k="eline2", K=ln, f=tid2, m=xxmap)
        f <<= nm.msortf(f="eline1%n,eline2%n")
        f <<= nm.mcut(r=True, f="eline1,eline2")
        f <<= nm.msortf(f=self.tidH)
        f <<= nm.mfldname(q=True, o=self.oFile)
        f.run()
        nu.mmsg.endLog(self.__cmdline())