Ejemplo n.º 1
0
 def addempty(self, nrtr):
     """
     Register the empty set as a closure whose support is nrtr.

     The new node goes to the FRONT of self.closeds (it is inserted at
     index 0, not appended), and self.card is incremented by one.
     """
     empty_closure = slanode("", nrtr)
     self.card = self.card + 1
     self.closeds.insert(0, empty_closure)
Ejemplo n.º 2
0
 def close_old(self,st):
     '''
     Compute the closure of the set st by scanning the whole list of
     current closures (self.closeds).

     The search starts from the node built from self.U and self.nrtr and
     keeps the last scanned closure that contains st and compares
     strictly below the best candidate found so far.
     CAREFUL: what if st is not included in self.U?
     '''
     best = slanode(self.U, self.nrtr)
     for candidate in self.closeds:
         # skip closures that do not contain st
         if not st <= candidate:
             continue
         # keep only candidates strictly below the current best
         if candidate < best:
             best = candidate
     return best
Ejemplo n.º 3
0
 def close(self,st):
     '''
     Compute the closure of the set st by walking down through the
     current minimal predecessors (self.preds_min).

     Starting from the node built from self.U and self.nrtr, repeatedly
     step to the first entry of self.preds_min[current] that still
     contains st; stop as soon as no entry does. This relies on the
     closed sets being ordered.
     CAREFUL: what if st is not included in self.U?
     '''
     current = slanode(self.U, self.nrtr)
     stepped_down = True
     while stepped_down and st <= current:
         stepped_down = False
         for pred in self.preds_min[current]:
             if st <= pred:
                 # first predecessor still containing st: descend into it
                 current = pred
                 stepped_down = True
                 break
     return current
Ejemplo n.º 4
0
def post_init(self):
    '''
    Auxiliary function that configures an instance of closminer.

    With no dataset file name, the lattice is set up with just an empty
    bottom closure. Otherwise the transactions are read from
    "<datasetfilename>.txt" (whitespace-separated items, one transaction
    per line; blank lines are not counted as transactions) and the
    closures are obtained, in this order of preference:

    * from the XML file self.xmlfilename, when self.xmlinput is set and
      the file can be read (in that case the function returns early);
    * from an external miner, when self.use_external_miner is set and a
      closures file exists or a command is available for the platform;
    * from the internal miner otherwise.

    Finally a summary is reported through the verbosity object self.v.

    Calls exit(0) when the dataset file cannot be opened.
    '''
    self.read_from_XML_file = self.xmlinput
    # NOTE(review): the XML file name uses supp_percent as it stands on
    # entry, i.e. before it is recomputed from self.supp further down;
    # confirm that this is intended.
    self.xmlfilename = "%s_cl%2.3fs.xml" % (self.datasetfilename,
                                            self.supp_percent)
    self.U = set()
    self.closeds = []
    # Negative-border counter; defined up front so that the final report
    # also works on the path that has no dataset file.
    nbord = 0
    if self.v is None:
        self.v = verbosity()
    self.v.inimessg("Initializing lattice")
    if not self.datasetfilename:
        # no file name given: lattice with just the empty closure
        self.v.messg(" with just a bottom empty closure.")
        self.nrocc = 0
        self.nrits = 0
        self.supp_percent = 0.0
        self.card = 0
        self.maxsupp = 0
        self.minsupp = 0
        self.addempty(0)
    else:
        try:
            datasetfile = open(self.datasetfilename + ".txt")
        except IOError:
            self.v.errmessg("Could not open file %s.txt" %
                            (self.datasetfilename))
            # NOTE(review): exits with status 0 although this is an error
            exit(0)
        with datasetfile:
            self.v.zero(2500)
            self.v.messg("from file %s... computing parameters..." %
                         (self.datasetfilename))
            self.nrocc = 0
            self.U = set()
            self.transcns = defaultdict(set)
            self.occurncs = defaultdict(set)
            for line in datasetfile:
                self.v.tick()
                # must be reset for every line: only lines that hold at
                # least one item count as a transaction
                isempty = True
                for el in line.strip().split():
                    if el:
                        isempty = False
                        self.nrocc += 1
                        self.U.add(el)
                        self.transcns[self.nrtr].add(el)
                        self.occurncs[el].add(self.nrtr)
                if not isempty:
                    self.nrtr += 1
        self.nrits = len(self.U)  # number of items
        self.intsupp = floor(
            self.supp * self.nrtr)  # support bound into absolute int value
        if not self.supp:
            # Borgelt's apriori might not work with support zero
            self.supp_percent = 0.001
        else:
            # there remains a scale issue to look at in the clfile name
            self.supp_percent = 100.0 * floor(
                self.scale * self.supp) / self.scale
        if self.read_from_XML_file:
            self.v.messg("...reading closures from XML file...")
            try:
                self.dexmlize(self.xmlfilename)
                self.v.messg(str(self.card) + " closures found.")
                return
            except IOError:
                self.v.messg(self.xmlfilename +
                             " not found, falling back to mining process...")
        if self.use_external_miner:
            # try using results of external apriori, or calling it
            clfilename = "%s_cl%2.3fs.txt" % (self.datasetfilename,
                                              self.supp_percent)
            suchfiles = glob(self.datasetfilename + "_cl*s.txt")
            if clfilename in suchfiles:
                # avoid calling apriori if closures file already available
                self.v.messg("...reading closures from file " + clfilename +
                             "...")

            elif system() in COMMANDS:
                exe, cmmnd = COMMANDS[system()]
                self.v.messg("platform appears to be " + system() + ";")
                self.v.messg("computing closures by: \n\t%s\n" % (cmmnd))
                if not glob(exe):
                    self.use_external_miner = False
                    self.v.errmessg(
                        "%s not found, falling back on internal closure miner"
                        % (exe))
                else:
                    cmmnd = cmmnd % (self.supp_percent,
                                     self.datasetfilename + ".txt", clfilename)
                    # NOTE(review): the command line is built from the
                    # dataset file name and run through the shell; unsafe
                    # if that name can come from untrusted input.
                    call(cmmnd, shell=True)
            else:
                # unhandled platform
                self.v.errmessg("Platform " + system() +
                                " not handled yet, sorry")
                self.use_external_miner = False
            if self.use_external_miner:
                # closures file in place, either was there or just computed
                self.card = 0
                self.maxsupp = 0
                self.minsupp = self.nrtr + 1
                self.v.zero(250)
                self.v.messg("...loading closures in...")
                with open(clfilename, 'r') as clfile:
                    for line in clfile:
                        # ToDo: maybe the file has lower support
                        # than desired and we do not want all closures there
                        self.v.tick()
                        node = slanode(line)
                        self.closeds.append(node)
                        self.card += 1
                        if node.supp > self.maxsupp:
                            self.maxsupp = node.supp
                        if 0 < node.supp < self.minsupp:
                            self.minsupp = node.supp
        if not self.use_external_miner:
            # use internal miner either as asked or
            # because could not use external apriori
            self.maxsupp = 0
            clos_singl = set()
            self.v.inimessg(
                "Computing closures at support %3.2f%%;" %
                (100.0 * floor(self.scale * self.supp) / self.scale))
            self.v.messg("singletons first...")
            for item in self.U:
                # initialize (min-)heap with closures of singletons
                self.v.tick()
                supset = self.occurncs[item]
                supp = len(supset)
                if supp > self.maxsupp:
                    self.maxsupp = supp
                if supp > self.intsupp:
                    # keyed by (nrtr - supp) so that the min-heap pops
                    # the largest support first
                    clos_singl.add(
                        (self.nrtr - supp, frozenset(self.inters(supset)),
                         frozenset(supset)))
                else:
                    nbord += 1
            cnt_clos_singl = len(clos_singl)
            self.v.messg(
                str(cnt_clos_singl) + " such closures; " +
                "computing larger closures...")
            pend_clos = list(clos_singl)
            heapify(pend_clos)
            self.minsupp = self.nrtr
            while pend_clos:
                # extract largest-support closure and find subsequent ones
                cl = heappop(pend_clos)
                spp = self.nrtr - cl[0]
                if spp < self.minsupp:
                    self.minsupp = spp
                node = slanode(cl[1], spp)
                self.closeds.append(node)
                self.U.update(node)
                self.card += 1
                for ext in clos_singl:
                    # try extending with freq closures of singletons
                    if not ext[1] <= cl[1]:
                        self.v.tick()
                        supportset = cl[2] & ext[2]
                        spp = len(supportset)
                        if spp <= self.intsupp:
                            nbord += 1
                        else:
                            next_clos = frozenset(self.inters(supportset))
                            # do not queue a closure that is already pending
                            if not any(cc[1] == next_clos
                                       for cc in pend_clos):
                                heappush(pend_clos,
                                         (self.nrtr - len(supportset),
                                          next_clos, frozenset(supportset)))
    if self.maxsupp < self.nrtr:
        # no bottom itemset, common to all transactions - hence add empty
        self.addempty(self.nrtr)
    else:
        self.v.messg("bottom closure is nonempty;")
    self.v.messg("...done.")

    if self.mustsort:
        self.v.messg("sorting...")
        self.closeds.sort()
        self.mustsort = False

    self.v.messg(str(self.card) + " closures found.")
    if nbord:
        # This info only available if the local miner was used
        self.v.messg("Additionally checked " + str(nbord) +
                     " infrequent sets as negative border.")
    self.v.inimessg("The max support is " + str(self.maxsupp) + ";")
    # With no transactions at all the percentage is undefined; report 0
    # instead of dividing by zero.
    if self.nrtr:
        minsupp_percent = float(self.minsupp * 100) / self.nrtr
    else:
        minsupp_percent = 0.0
    self.v.messg("the effective absolute support threshold is " +
                 str(self.minsupp) +
                 (", equivalent to %2.3f" % (minsupp_percent)) + "% of " +
                 str(self.nrtr) + " transactions.")