コード例 #1
0
def ModularityProfile(data,
                      COST=4,
                      Kmin=1,
                      Kmax=Kmax,
                      Klist=None,
                      edgeDict={},
                      simDict={},
                      bagSize=bagSize,
                      trials=trials,
                      trials_decay=trials_decay):
    '''
    Function for determining the optimal number of k in clara algorithm.
    Currently by means of modularity maximization over various numbers of k.
    Input: all Clara.py input + bag size per run, list of k's to go over (not
    obligatory range(1:Kmax)), number of trials per k instance.
    trials_decay indicates whether trials number subside with k. If true, it
    decreases evenly up to 3 for k==Kmax.
    Output: modularity distributions + errorplot saved on disk.
    '''
    # runs clara prespecified number of times on a range of k's
    if Klist == None:
        Klist = range(Kmin, Kmax + 1)
    if costType != "modularity":
        print "cost function is not modularity - cannot start"
        return (1)
    modMax = []
    mod_lists = []
    modStdev = []
    modMean = []
    for k in Klist:
        # setting up the number of attempts to infer about modularity distribution
        # given the number of clusters k
        if trials_decay: ktrials = int(trials * (1 - k / float(Kmax))) + 3
        else: ktrials = trials
        min_cost, best_choice, best_res, cost_list, isolates = clara(
            data,
            k,
            COST,
            simDictClara=simDict,
            affinities=edgeDict,
            bagSize=bagSize,
            claraLoopNum=ktrials,
            noIsolates=True,
            saveAllResults=False,
            acceleration=2)

        modMax.append(-min_cost)  # negative sign due to minimization in clara
        mod_lists.append(cost_list)
        num_cost_list = np.array(cost_list)
        modStdev.append((1. + 1. / ktrials)**0.5 * np.std(num_cost_list))
        print "multiple: ", modStdev[-1]
        print "np.std(num_cost_list): ", np.std(num_cost_list)
        print "isolates: ", isolates
        modMean.append(-np.mean(num_cost_list))  # again negative

    # picking the number of clusters maximizing modularity
    Kopt = 0
    for k in Klist[:-1]:
        if modMean[Klist.index(k)] >= modMean[Klist.index(k) + 1] - modStdev[
                Klist.index(k) + 1]:
            Kopt = k
            break
    if not Kopt:
        Kopt = Klist[len(Klist) - 1]

    # plotting and saving
    plt.errorbar(x=Klist, y=modMean, yerr=modStdev, fmt='-o')
    plt.plot(Klist, modMax, '-o')
    plt.plot(Klist.index(Kopt) + 1, modMean[Klist.index(Kopt)], 'ro')
    plt.savefig("kink.png")

    return Kopt, mod_lists, modStdev, modMean, modMax
コード例 #2
0
def SGJRIcores(data,
               COST=4,
               K=12,
               edgeDict={},
               simDict={},
               bagSize=bagSize,
               distDict={},
               trials=100,
               threshold=None,
               segments=None,
               loadedCores=False,
               filepathCores=None,
               dendroFormat='png',
               acceleration=0):
    '''
    Average Clara results from lots of trials, and for each pair
    of nodes compute proportion of times they appear in the same cluster.
    Then discard the edges for which the proportion is lower than prespecified
    threshold alpha. After all, hierarchical structure of connected
    components is obtained, along with stable cores, whose contents
    remained the same whatever the partition is.
    Input
    '''
    print "SGJRI COST: ", COST

    # if the number of resulting segments is not specified, it
    # will be equal to K in clara, which provides most sensible results:
    if segments == None:
        segments = K
        print "as you didn't specify the number of segments, it will equal K: ", segments

    if loadedCores:
        if filepathCores == None:
            print "Error: no filepath for loading cores specified"
            return 1
        commonCluster = pickle.load(open(filepathCores, 'r'))[1]
        clustered = commonCluster.shape[0]
        clustData = []
        for i in xrange(clustered):
            clustData.append(commonCluster[i][0]['A'])
    else:
        isolates, allResults = clara(data,
                                     COST=COST,
                                     k=K,
                                     simDictClara=simDict,
                                     affinities=edgeDict,
                                     bagSize=bagSize,
                                     claraLoopNum=trials,
                                     noIsolates=True,
                                     saveAllResults=True,
                                     acceleration=acceleration)[4:6]

        # now the data list is already pruned, so there are no need to prune it
        # again. the code is excessive
        clustered = len(data)  # - len(isolates)
        clustData = [i for i in data if i not in isolates]
        commonCluster = np.zeros(shape=(clustered, clustered),
                                 dtype=[('count', 'f8'), ('A', 'a100'),
                                        ('B', 'a100')])

        print "isolates: ", len(isolates)
        #print "data: ", len(data)
        print "clustered: ", clustered

        # determining node names for dendrogram labels
        for i in xrange(clustered):
            for j in xrange(clustered):  #xrange(i+1, clustered):
                commonCluster[i][j]['A'] = clustData[i]
                commonCluster[i][j]['B'] = clustData[j]

        print "co-appearance counting and writing it into numpy array..."
        # co-appearance counting and writing it into numpy array
        count = 0
        for res in xrange(len(allResults)):
            for clus in allResults[res]:
                clusContents = list(allResults[res][clus])
                for nodeA in allResults[res][clus]:
                    # reduce the loop more than twice:
                    clusContents.remove(nodeA)
                    Aid = clustData.index(nodeA)
                    for nodeB in clusContents:
                        Bid = clustData.index(nodeB)
                        # making matrix symmetric with zero diagonal
                        commonCluster[Aid][Bid]['count'] += 1. / trials
                        commonCluster[Bid][Aid]['count'] += 1. / trials
            count += 1
            if count % 10 == 0:
                print "clusterings processed: %d out of %d" % (count, trials)
    """
    # transforming the matrix of co-occurences into matrix of distances
    for i in xrange(clustered):
        for j in xrange(clustered): #xrange(i+1, clustered):
            if commonCluster[i][j]['count'] == 0.: commonCluster[i][j]['count'] = 20.
            else: commonCluster[i][j]['count'] = 1. / commonCluster[i][j]['count']
    """

    # some debug output
    print "shape: ", commonCluster.shape
    #print is_valid_dm(commonCluster)   # -- invalid
    #print is_valid_y(commonCluster)

    # now obtaining the tree structure
    t = time.time()
    print "deriving three structure..."
    treeStruct = linkage(commonCluster, method='ward')
    print "tree structure obtained in %f seconds" % (time.time() - t)

    # then visualise the tree (prune according to common sense)
    t = time.time()
    print "constructing a dendrogram..."
    if threshold == None:
        threshold = treeStruct[len(treeStruct) - segments, 2] + 0.01
    #print len(treeStruct[len(treeStruct)-segments:])

    magnify = (float(clustered) / 400.)**0.5
    if dendroFormat == 'png': leaf_font_size = 13
    if (dendroFormat == 'svg') or (dendroFormat == 'pdf'):
        leaf_font_size = 2 * (3**0.5) / magnify

    dendro = dendrogram(treeStruct,
                        labels=clustData,
                        leaf_font_size=leaf_font_size,
                        color_threshold=threshold)
    print "dendrogram constructed in %f seconds" % (time.time() - t)

    t = time.time()
    print "saving the dendrogram..."
    figure = plt.gcf()
    if dendroFormat == 'png':
        figure.set_size_inches(120 * magnify, 60 * magnify)
    elif dendroFormat == 'svg' or dendroFormat == 'pdf':
        figure.set_size_inches(20 * magnify, 10 * magnify)

    plt.savefig("dendrogram_" + Version + "." + dendroFormat)

    print "dendrogram saved in %f seconds" % (time.time() - t)

    # then construct an array with clustered domains
    threshold = treeStruct[len(treeStruct) - segments, 2] - 0.000001
    clustIDs = fcluster(treeStruct, threshold, 'distance')

    segmentDict = {}
    for i in xrange(segments):
        segmentDict[i] = [
            clustData[j] for j in range(len(clustData)) if clustIDs[j] == i + 1
        ]

    # trying to get by without distDict
    if distDict != {}:
        # searching the most central nodes within the clusters
        harmCentr = intra_cluster_centrality(clustData,
                                             distDict=distDict,
                                             medoids=segmentDict)

        # combine results batch into mongo-acceptable format
        date = datetime.now().strftime("%Y-%m-%d")
        clusters = []
        for i in harmCentr:
            clDict = {'number': i}
            clDict['domains'] = harmCentr[i]
            clusters.append(clDict)

        mongo = {'date': date, 'clusters': clusters}

    else:
        print "Warning: no harmCentr and mongo!"
        harmCentr = {}
        mongo = {}

    return clustData, commonCluster, treeStruct, segmentDict, harmCentr, mongo