Exemple #1
0
def run(ngramPath, sid_seq, outPath):
    """
    Cluster all users of ``sid_seq`` in one call.

    Lightweight counterpart of the main entry point: it builds the root
    matrix, hands it to ``HCClustering`` and returns whatever
    ``runDiana`` produces.  Written for the rhcBootstrap testing script.

    :type ngramPath: str
          - the path to the computed pattern dataset
    :type sid_seq: Dict{int:Dict{str:int}}
          - for each user, the patterns seen and how often each occurred
    :type outPath: str
          - the path to store the output and temporary file
    :return: the value returned by ``HCClustering.runDiana``
    """
    global matrixCompTotal

    began = time.time()
    userCount = len(sid_seq)

    # users are addressed by the sids 1..N
    allSids = list(range(1, userCount + 1))

    # idf weights over every user, with no feature excluded yet
    rawIdf = rhc.getIdf(sid_seq, allSids)
    weights = rhc.excludeFeatures(rawIdf, [])

    tmpName = 'tmp_%sroot' % int(time.time())
    rootMatrix = calculateDistance.partialMatrix(
        allSids, weights, ngramPath, tmpName, outPath, True)

    print('[LOG]: first matrixTime %f' % (time.time() - began))
    matrixCompTotal += time.time() - began

    # clusters holding at most 5% of the users are not split any further
    clusterer = HCClustering(
        rootMatrix, sid_seq, outPath, [], allSids,
        sizeThreshold=0.05 * userCount, idfMap=weights)
    tree = clusterer.runDiana()

    print('[STAT]: total clustering time %f' % (time.time() - began))
    return tree
Exemple #2
0
 def __init__(self, matrix, sid_seq, outPath, exclusions, idxToSid,
              sizeThreshold, idfMap=None):
     """
     Store the clustering inputs, deriving any mapping that was not given.

     :param matrix: the matrix that ``runDiana`` will cluster
     :param sid_seq: per-sid mapping of pattern => occurrence count
     :param outPath: prefix used for output / temporary files
     :param exclusions: features that are already excluded
     :param idxToSid: matrix row index => sid; when empty or ``None`` the
                      sids ``1..len(sid_seq)`` are used
     :param sizeThreshold: cluster size limit stored for ``runDiana``
     :param idfMap: precomputed idf map; when empty or ``None`` it is
                    rebuilt through ``rhc.getIdf`` / ``rhc.excludeFeatures``
     """
     # an empty / missing index => sid table defaults to sids 1..N
     sidTable = idxToSid if idxToSid else list(range(1, len(sid_seq) + 1))

     # an empty / missing idf map is rebuilt from the sequences, leaving
     # out the features that are already excluded
     if idfMap:
         weights = idfMap
     else:
         weights = rhc.excludeFeatures(
             rhc.getIdf(sid_seq, sidTable), exclusions)

     self.matrix, self.sid_seq, self.outPath = matrix, sid_seq, outPath
     self.exclusions = exclusions
     self.sizeThreshold = sizeThreshold
     # diameter given to the root cluster before any split happened
     self.maxDistance = 100
     self.idxToSid = sidTable
     self.idfMap = weights
Exemple #3
0
    def runDiana(self):
        """
        Perform recursive hierarchical clustering on ``self.matrix``.

        The work happens in three phases:

        1. Split: repeatedly hand the last cluster of the sorted work list
           to ``self.splitCluster`` until that cluster has a falsy diameter
           or is not larger than ``self.sizeThreshold``.  A modularity
           score is recorded for every intermediate number of clusters.
        2. Merge: take the evaluated cluster count closest to
           ``rhc.getSweetSpot`` and undo the most recent splits until that
           many clusters are left.
        3. Recurse: ask ``rhc.getExclusionMap`` for the features to exclude
           per cluster; every cluster still larger than
           ``self.sizeThreshold`` gets a fresh matrix and is clustered again
           by a new ``HCClustering``.

        Timing of the individual phases is accumulated in the module-level
        counters declared ``global`` below.

        :rtype: Tuple[str, List[tuple], Dict]
              - ``('t', results, {'sweetspot': score})`` where every entry
                of ``results`` is either a leaf ``('l', sids, info)`` or the
                tree produced by a nested ``runDiana`` call, whose info dict
                additionally carries ``'exclusions'`` and
                ``'exclusionsScore'``
        """
        global matrixCompTotal, splitTotal, modularityTotal, diaTotal
        global excluTotal

        self.modularityBasics()
        print('[LOG]: finished calculating modularityBasics')

        # one (parentCid, childACid, childBCid) entry per split, in the
        # order the splits were performed
        clusterHi = []

        # number of clusters => modularity score with that many clusters
        evalResults = {}

        cid = 1
        # every work item is (member row indices, diameter, cid)
        clusters = [(range(len(self.matrix)), self.maxDistance, cid)]

        # get a mapping from cid => list of row sums for all ids in cluster
        self.sumEntriesMap = {}
        self.sumEntriesMap[cid] = np.sum(self.matrix, axis=1, dtype=np.float64)

        # ---- phase 1: split -------------------------------------------
        while clusters[-1][1] and len(clusters[-1][0]) > self.sizeThreshold:
            parentCid = clusters[-1][2]
            clusterHi.append((parentCid, cid + 1, cid + 2))

            curTime = time.time()
            (clusterA, clusterB, sumEntryA, sumEntryB, sumAB) = \
                self.splitCluster(clusters.pop())
            splitTotal += time.time() - curTime

            curTime = time.time()
            cid += 1
            self.sumEntriesMap[cid] = sumEntryA
            clusters.append((clusterA, self.getDia(cid, clusterA), cid))
            cid += 1
            self.sumEntriesMap[cid] = sumEntryB
            clusters.append((clusterB, self.getDia(cid, clusterB), cid))
            diaTotal += time.time() - curTime

            curTime = time.time()
            # ascending by (diameter, size); clusters that are already small
            # enough get the lowest key so they never end up last
            clusters = \
                sorted(clusters, key=lambda x: (x[1], len(x[0]))
                       if len(x[0]) > self.sizeThreshold else (0, 0))
            if len(clusters) == 2:
                # if it is the first time to compute modularity
                evalResult = self.evaluateModularity(
                    (clusterA, clusterB), (sumEntryA, sumEntryB))
            else:
                # if it is based on the previous scores
                evalResult = evalResults[len(clusters) - 1] + \
                    self.evaluateModularityShift((clusterA, clusterB), sumAB)
            modularityTotal += time.time() - curTime

            evalResults[len(clusters)] = evalResult

        # ---- phase 2: merge back to the sweet spot ---------------------
        sweetSpot = rhc.getSweetSpot(evalResults, 5)
        # snap to the closest cluster count that was actually evaluated
        sweetSpot = min(evalResults, key=lambda x: abs(x - sweetSpot))
        print('[LOG]: sweetSpot is %d, modularity %f' %
              (sweetSpot, evalResults[sweetSpot]))

        clusterMap = {row[2]: row for row in clusters}
        liveCids = set(clusterMap)
        while len(liveCids) > sweetSpot:
            (parentCid, childACid, childBCid) = clusterHi.pop()
            # diameter doesn't matter after merging, so put zero here
            clusterMap[parentCid] = \
                (clusterMap[childACid][0] + clusterMap[childBCid][0],
                 0, parentCid)
            liveCids.add(parentCid)
            liveCids.remove(childACid)
            liveCids.remove(childBCid)

        # reconstruct the cluster list after merging; rows now have the
        # shape (member row indices, diameter, None, cid)
        clusters = [(row[0], row[1], None, row[2])
                    for key, row in clusterMap.items() if key in liveCids]

        # ---- phase 3: exclusions and recursion -------------------------
        # get the exclusion map according to the current clustering
        startTime = time.time()
        excludeMap, exclusionScoreMap, _scoreMap = \
            rhc.getExclusionMap(clusters, self.sid_seq, self.idfMap,
                                self.idxToSid, [row[3] for row in clusters],
                                self.exclusions)
        excluTotal += time.time() - startTime

        results = []
        for row in clusters:
            clusterCid = row[3]
            sids = sorted([self.idxToSid[nidx] for nidx in row[0]])
            excludedFeatures = excludeMap[clusterCid]
            excludedScores = exclusionScoreMap[clusterCid]

            # small clusters are final: report them as leaves
            if len(sids) <= self.sizeThreshold:
                results.append(('l', sids,
                                {'exclusions': excludedFeatures,
                                 'exclusionsScore': excludedScores}))
                continue

            newExclusions = self.exclusions + excludedFeatures
            newExclusionSet = set(newExclusions)

            # separate the sids whose vector is all zeros once the excluded
            # features are removed from the ones that still carry features
            keptSids = []
            excludedSids = []
            for sid in sids:
                if any(feature not in newExclusionSet
                       for feature in self.sid_seq[sid]):
                    keptSids.append(sid)
                else:
                    excludedSids.append(sid)
            sids = keptSids

            # if the cluster size is too small after feature selection,
            # don't cluster it
            if len(sids) <= self.sizeThreshold:
                results.append(('l', sids + excludedSids,
                                {'exclusions': excludedFeatures,
                                 'exclusionsScore': excludedScores}))
                continue

            matrixStart = time.time()
            # computed once and shared by the matrix and the sub-clustering
            subIdfMap = rhc.excludeFeatures(
                rhc.getIdf(self.sid_seq, sids), newExclusions)
            # NOTE(review): ngramPath is not an attribute of this object; it
            # is resolved as a module-level name here -- confirm it is
            # defined whenever this branch can be reached
            matrix = calculateDistance.partialMatrix(
                sids,
                subIdfMap,
                ngramPath,
                'tmp_%d' % clusterCid,
                '%st%d_' % (self.outPath, clusterCid),
                True)
            matrixCompTotal += time.time() - matrixStart

            # special case: if the first row of the matrix adds up to zero
            # the whole matrix is taken to be zero, so do not split further
            if np.sum(matrix[0]) == 0:
                results.append(('l', sids + excludedSids,
                                {'exclusions': excludedFeatures,
                                 'exclusionsScore': excludedScores}))
                continue

            # now that we have a new distance matrix, go and do another
            # round of clustering
            result = HCClustering(
                matrix,
                self.sid_seq,
                '%sp%d_' % (self.outPath, clusterCid),
                newExclusions,
                sids,
                self.sizeThreshold,
                idfMap=subIdfMap).runDiana()

            # keep the info dict of the nested tree (it holds the child's
            # sweetspot score) and extend it with this level's exclusions
            info = result[2] if len(result) > 2 else {}

            # put the excluded sids back as a cluster
            if excludedSids:
                result[1].append(('l', excludedSids, {'isExclude': True}))

            info['exclusions'] = excludedFeatures
            info['exclusionsScore'] = excludedScores
            results.append((result[0], result[1], info))

        return ('t', results, {'sweetspot': evalResults[sweetSpot]})