def dexmlize(self, clfilename=""):
    """
    Load closed sets from an XML closures file into this lattice.

    Every child of the file's "closures" element is read as one closed
    set: the "value" attribute of each of its own children is taken as
    an item, and its "support" attribute as the integer support. One
    node per closed set is built with set2node and appended to
    self.closeds (expected to be [] on entry); self.card is incremented
    per node, and self.minsupp ends up as the smallest support read, or
    as self.nrtr when the file lists no closures.

    clfilename: path of the XML file; when empty, self.xmlfilename is
    used instead.

    Raises ValueError when the document has no "closures" element.
    Errors raised while opening or parsing the file propagate unchanged.

    ToDo: check support in xml file, read/write further params there
    (e.g. dataset params)...
    """
    if not clfilename:
        clfilename = self.xmlfilename
    xmldoc = xml.etree.ElementTree.parse(clfilename)
    elemclos = xmldoc.find("closures")
    # compare against None explicitly: an element without children is falsy
    if elemclos is None:
        raise ValueError("no <closures> element in " + clfilename)
    self.minsupp = self.nrtr
    # iterate elements directly: Element.getchildren() is gone in Python 3.9+
    for clo in elemclos:
        # handle a closed set
        # to do: check that the children really are items
        items = set(itelem.get("value") for itelem in clo)
        spp = int(clo.get("support"))
        clos = set2node(items, spp)
        if spp < self.minsupp:
            self.minsupp = spp
        self.closeds.append(clos)
        self.card += 1
def findmingens(self, suppthr=-1):
    """
    Compute the minimal generators for each closure.

    Nontrivial pairs conform Wild's iteration-free basis; the algorithm
    is the same as mineRR for confidence 1.

    suppthr: optional value in [0,1] meant to impose an extra level of
    iceberg. It is currently ignored, because minsupp may actually be
    much higher than the support mined at: the threshold is always
    derived from self.minsupp, the support found in the closures file.
    When other supports get handled, remember to memorize computed ones.

    Returns self.mingens, which maps every node of self.closeds to the
    list of nodes built from the hyperedges of the transversal of its
    faces; each such node gets the support of its closure and a .clos
    attribute pointing back to it. A nonempty self.mingens is returned
    as is, without recomputation.
    """
    # answer from the cache before doing any arithmetic, so that a
    # lattice without transactions cannot fail on the division below
    if self.mingens:
        return self.mingens
    if self.nrtr:
        # integer division keeps the scaled threshold exact
        sthr = int(self.scale * self.minsupp // self.nrtr)
    else:
        # no transactions: the support ratio is undefined, use 0
        sthr = 0
    self.v.inimessg("Computing cuts for minimal generators...")
    nonants = self.setcuts(sthr, self.scale, False)[1]
    self.v.zero(250)
    self.v.messg("computing transversal antecedents...")
    for nod in self.closeds:
        # careful, assuming nodes ordered by size here - find all free sets
        self.v.tick()
        self.mingens[nod] = []
        for m in self._faces(nod, nonants[nod]).transv().hyedges:
            mm = set2node(m)
            mm.setsupp(nod.supp)
            mm.clos = nod
            self.mingens[nod].append(mm)
    self.v.messg("...done;")
    return self.mingens
def __init__(self, supp, datasetfilename, v=None, xmlinput=False,
             externalminer=True):
    """
    Build the lattice of closed sets of a dataset at a given support.

    supp: support threshold, float in [0,1].
    datasetfilename: dataset name without its ".txt" extension; the
        empty string builds a lattice with just a bottom empty closure.
    v: object used for all progress/error messages; None creates a
        default verbosity(), False creates verbosity(False).
    xmlinput: when true, first try to load the closures from the XML
        file self.xmlfilename; if that raises IOError, fall back to
        mining.
    externalminer: when true, read the closures from an existing
        "<dataset>_cl<supp>s.txt" file or run the external apriori
        program to create it; when false, or when the external program
        is unavailable or the platform is not handled, use the internal
        closure miner.

    Calls exit(0) after an error message if the dataset file cannot be
    opened.
    """
    self.use_external_miner = externalminer
    self.read_from_XML_file = xmlinput
    if v is None:
        self.v = verbosity()
    elif v == False:
        # equality on purpose, as before: 0 is accepted like False
        self.v = verbosity(False)
    else:
        self.v = v
    self.U = set([])
    self.scale = 100000
    self.datasetfilename = datasetfilename
    self.supp_percent = self.topercent(supp)
    self.xmlfilename = "%s_cl%2.3fs.xml" % (datasetfilename,
                                            self.supp_percent)
    self.closeds = []
    self.card = 0
    self.mustsort = False
    self.v.inimessg("Initializing lattice")

    if datasetfilename == "":
        # no dataset: nothing to read or mine
        self.v.messg(" with just a bottom empty closure.")
        self.nrocc = 0
        self.nrtr = 0
        self.nrits = 0
        self.supp_percent = 0.0
        self.card = 0
        self.maxsupp = 0
        self.minsupp = 0
        self.addempty(0)
        return

    try:
        datasetfile = open(datasetfilename + ".txt")
    except IOError:
        self.v.errmessg("Could not open file " + datasetfilename + ".txt")
        exit(0)
    self.v.zero(2500)
    self.v.messg("from file " + datasetfilename +
                 "... computing parameters...")
    self.nrocc = 0
    self.nrtr = 0
    self.U = set([])
    self.transcns = defaultdict(set)
    self.occurncs = defaultdict(set)
    with datasetfile:
        for line in datasetfile:
            self.v.tick()
            items = line.split()
            if not items:
                # blank line: not a transaction, must not consume an id
                continue
            for el in items:
                self.nrocc += 1
                self.U.add(el)
                self.transcns[self.nrtr].add(el)
                self.occurncs[el].add(self.nrtr)
            self.nrtr += 1
    self.nrits = len(self.U)
    # support bound into absolute int value
    self.intsupp = floor(supp * self.nrtr)
    if supp == 0:
        # Borgelt's apriori might not work with support zero
        self.supp_percent = 0.001
    else:
        # there remains a scale issue to look at in the clfile name
        self.supp_percent = self.topercent(supp)

    if self.read_from_XML_file:
        self.v.messg("...reading closures from XML file...")
        try:
            self.dexmlize(self.xmlfilename)
            self.v.messg(str(self.card) + " closures found.")
            return
        except IOError:
            self.v.messg(self.xmlfilename +
                         " not found, falling back to mining process...")

    nbord = 0
    if self.use_external_miner:
        # try using results of external apriori, or calling it
        clfilename = "%s_cl%2.3fs.txt" % (datasetfilename,
                                          self.supp_percent)
        suchfiles = glob(datasetfilename + "_cl*s.txt")
        cmmnd = ('./apriori.exe -tc -l1 -u0 -v" /%%a" -s%2.3f %s ' %
                 (self.supp_percent, datasetfilename + ".txt")) + clfilename
        platform_name = system()
        if clfilename in suchfiles:
            # avoid calling apriori if closures file already available
            self.v.messg("...reading closures from file " + clfilename +
                         "...")
        elif platform_name == "Darwin":
            if not glob("aprioriD"):
                self.use_external_miner = False
                self.v.errmessg(
                    "aprioriD not found, falling back on internal closure miner"
                )
            else:
                cmmnd = ('./aprioriD -tc -l1 -u0 -v" /%%a" -s%2.3f %s ' %
                         (self.supp_percent,
                          datasetfilename + ".txt")) + clfilename
                call(cmmnd, shell=True)
        elif platform_name == "Linux":
            # ToDo: make this case work (an aprioriL32 binary run
            # through the shell, analogous to the Darwin case)
            self.v.errmessg("Platform " + platform_name +
                            " not handled yet, sorry")
            self.use_external_miner = False
        elif platform_name == "Windows" or platform_name == "Microsoft":
            self.v.messg("platform appears to be " + platform_name + ";")
            self.v.messg("computing closures by: \n " + cmmnd + "\n")
            if not glob("apriori.exe"):
                self.use_external_miner = False
                self.v.errmessg(
                    "apriori.exe not found, falling back on internal closure miner"
                )
            else:
                call(cmmnd)
        elif platform_name == "CYGWIN_NT-5.1":
            self.v.messg("platform appears to be " + platform_name + ";")
            self.v.messg("computing closures by: \n " + cmmnd + "\n")
            if not glob("apriori.exe"):
                self.use_external_miner = False
                self.v.errmessg(
                    "apriori.exe not found, falling back on internal closure miner"
                )
            else:
                call(cmmnd, shell=True)
        elif platform_name == "CYGWIN_NT-6.1-WOW64":
            self.v.messg("platform appears to be " + platform_name + ";")
            self.v.messg("computing closures by: \n " + cmmnd + "\n")
            call(cmmnd, shell=True)
        else:
            # unhandled platform
            self.v.errmessg("Platform " + platform_name +
                            " not handled yet, sorry")
            self.use_external_miner = False

    if self.use_external_miner:
        # closures file in place, either was there or just computed
        self.card = 0
        self.maxsupp = 0
        self.minsupp = self.nrtr + 1
        self.v.zero(250)
        self.v.messg("...loading closures in...")
        # open() instead of the Python-2-only file(); closed on exit
        with open(clfilename) as clfile:
            for line in clfile:
                # ToDo: maybe the file has lower support than desired
                # and we do not want all closures there
                self.v.tick()
                node = str2node(line)
                self.closeds.append(node)
                self.card += 1
                if node.supp > self.maxsupp:
                    self.maxsupp = node.supp
                if node.supp != 0 and node.supp < self.minsupp:
                    self.minsupp = node.supp

    if not self.use_external_miner:
        # use internal miner either as asked or because
        # could not use external apriori
        self.maxsupp = 0
        clos_singl = set([])
        self.v.inimessg("Computing closures at support %3.2f%%;" %
                        self.topercent(supp))
        self.v.messg("singletons first...")
        for item in self.U:
            # initialize (min-)heap with closures of singletons
            self.v.tick()
            supset = self.occurncs[item]
            item_supp = len(supset)
            if item_supp > self.maxsupp:
                self.maxsupp = item_supp
            if item_supp > self.intsupp:
                clos_singl.add((self.nrtr - item_supp,
                                frozenset(self.inters(supset)),
                                frozenset(supset)))
            else:
                nbord += 1
        cnt_clos_singl = len(clos_singl)
        self.v.messg(str(cnt_clos_singl) + " such closures; " +
                     "computing larger closures...")
        pend_clos = list(clos_singl.copy())
        heapify(pend_clos)
        self.minsupp = self.nrtr
        while pend_clos:
            # extract largest-support closure and find subsequent ones;
            # heap keys are nrtr - support, so the smallest key pops first
            cl = heappop(pend_clos)
            spp = self.nrtr - cl[0]
            if spp < self.minsupp:
                self.minsupp = spp
            node = set2node(cl[1], spp)
            self.closeds.append(node)
            self.U.update(node)
            self.card += 1
            for ext in clos_singl:
                # try extending with freq closures of singletons
                if not ext[1] <= cl[1]:
                    self.v.tick()
                    supportset = cl[2] & ext[2]
                    spp = len(supportset)
                    if spp <= self.intsupp:
                        nbord += 1
                    else:
                        next_clos = frozenset(self.inters(supportset))
                        if next_clos not in [cc[1] for cc in pend_clos]:
                            heappush(pend_clos,
                                     (self.nrtr - len(supportset),
                                      next_clos,
                                      frozenset(supportset)))

    if self.maxsupp < self.nrtr:
        # no bottom itemset, common to all transactions - hence add empty
        self.addempty(self.nrtr)
    else:
        self.v.messg("bottom closure is nonempty;")
    self.v.messg("...done.")
    if self.mustsort:
        self.v.messg("sorting...")
        self.closeds.sort()
        self.mustsort = False
    self.v.messg(str(self.card) + " closures found.")
    if nbord != 0:
        # This info only available if the local miner was used
        self.v.messg("Additionally checked " + str(nbord) +
                     " infrequent sets as negative border.")
    self.v.inimessg("The max support is " + str(self.maxsupp) + ";")
    if self.nrtr:
        minsupp_percent = float(self.minsupp * 100) / self.nrtr
    else:
        # dataset without transactions: avoid dividing by zero
        minsupp_percent = 0.0
    self.v.messg("the effective absolute support threshold is " +
                 str(self.minsupp) +
                 (", equivalent to %2.3f" % minsupp_percent) +
                 "% of " + str(self.nrtr) + " transactions.")