def ModularityProfile(data, COST=4, Kmin=1, Kmax=Kmax, Klist=None, edgeDict={},
                      simDict={}, bagSize=bagSize, trials=trials,
                      trials_decay=trials_decay):
    '''
    Pick the number of clusters k for clara by profiling modularity over
    a list of candidate k values.

    clara is run once per candidate k. From the cost list it returns, the
    mean, a spread and the best value are recorded; the signs of the mean and
    of the best value are flipped (the original author's note: clara
    minimises, so modularity is the negated cost). The chosen k is the first
    candidate whose mean is not lower than the next candidate's mean minus
    that next candidate's spread; if no candidate satisfies this, the last
    candidate is chosen.

    Input:
        data, COST, edgeDict, simDict, bagSize -- forwarded to clara
            (edgeDict as `affinities`, simDict as `simDictClara`). The dict
            defaults are shared between calls; this function does not
            modify them itself.
        Kmin, Kmax -- bounds of the default candidate range, used only when
            Klist is None. Kmax is also the reference point of trials_decay.
        Klist -- explicit candidates, in the order they should be compared;
            need not be a contiguous range.
        trials -- clara loop count per candidate (passed as claraLoopNum).
        trials_decay -- if true, the loop count shrinks linearly with k,
            reaching 3 at k == Kmax, and is never allowed below 3.

    Output:
        (Kopt, mod_lists, modStdev, modMean, modMax), with one entry per
        candidate in the four lists. The error-bar plot is saved as
        "kink.png". Returns 1 instead if the module-level costType is not
        "modularity" or if there are no candidates.
    '''
    if Klist is None:
        Klist = range(Kmin, Kmax + 1)
    # Materialise once so positions, slicing and plotting behave the same for
    # any iterable the caller hands in.
    Klist = list(Klist)

    if costType != "modularity":
        print("cost function is not modularity - cannot start")
        return 1
    if not Klist:
        # Same error convention as above, instead of an IndexError later on.
        print("no candidate numbers of clusters given - cannot start")
        return 1

    modMax = []
    mod_lists = []
    modStdev = []
    modMean = []
    for k in Klist:
        # Number of attempts used to infer the modularity distribution for
        # this k.
        if trials_decay:
            # The floor of 3 only matters for k > Kmax (possible with a custom
            # Klist), where the linear formula would drop to zero or below.
            ktrials = max(3, int(trials * (1 - k / float(Kmax))) + 3)
        else:
            ktrials = trials

        min_cost, best_choice, best_res, cost_list, isolates = clara(
            data, k, COST,
            simDictClara=simDict,
            affinities=edgeDict,
            bagSize=bagSize,
            claraLoopNum=ktrials,
            noIsolates=True,
            saveAllResults=False,
            acceleration=2)

        modMax.append(-min_cost)  # negative sign due to minimization in clara
        mod_lists.append(cost_list)

        costs = np.array(cost_list)
        spread = np.std(costs)
        # Spread widened by sqrt(1 + 1/ktrials).
        modStdev.append((1. + 1. / ktrials) ** 0.5 * spread)
        print("multiple:  %s" % (modStdev[-1],))
        print("np.std(num_cost_list):  %s" % (spread,))
        print("isolates:  %s" % (isolates,))
        modMean.append(-np.mean(costs))  # again negative

    # Walk the candidates by position (not by value lookup) and stop at the
    # first one whose successor is not better by more than its own spread.
    optPos = len(Klist) - 1
    for pos in range(len(Klist) - 1):
        if modMean[pos] >= modMean[pos + 1] - modStdev[pos + 1]:
            optPos = pos
            break
    Kopt = Klist[optPos]

    # Plotting and saving. The marker is placed at x == Kopt so it sits on the
    # curve, which is drawn against Klist itself.
    # NOTE(review): the figure is drawn on pyplot's current figure and is not
    # closed after saving.
    plt.errorbar(x=Klist, y=modMean, yerr=modStdev, fmt='-o')
    plt.plot(Klist, modMax, '-o')
    plt.plot(Kopt, modMean[optPos], 'ro')
    plt.savefig("kink.png")

    return Kopt, mod_lists, modStdev, modMean, modMax
def SGJRIcores(data, COST=4, K=12, edgeDict={}, simDict={}, bagSize=bagSize,
               distDict={}, trials=100, threshold=None, segments=None,
               loadedCores=False, filepathCores=None, dendroFormat='png',
               acceleration=0):
    '''
    Build a co-occurrence matrix from many clara runs, cluster it
    hierarchically and cut the tree into segments.

    For each pair of nodes, the matrix holds 1/trials for every saved clara
    result in which the two nodes share a cluster. The matrix rows are then
    fed to Ward linkage, a dendrogram is saved to disk, and the tree is cut
    into at most `segments` flat clusters.

    Input:
        data, COST, K, edgeDict, simDict, bagSize, acceleration -- forwarded
            to clara (K as `k`, edgeDict as `affinities`, simDict as
            `simDictClara`). Ignored when loadedCores is true.
        distDict -- if non-empty, passed to intra_cluster_centrality to rank
            nodes inside every segment; otherwise that step is skipped.
        trials -- clara loop count, also the normalising constant of the
            co-occurrence counts.
        threshold -- colour threshold of the dendrogram only. Defaults to the
            height of row len(tree) - segments of the linkage matrix
            plus 0.01.
        segments -- number of flat clusters wanted; defaults to K.
        loadedCores, filepathCores -- when loadedCores is true, the
            co-occurrence matrix is un-pickled from filepathCores (element 1
            of the stored object) instead of being computed.
        dendroFormat -- file extension of the saved dendrogram. 'svg' and
            'pdf' get a small figure with scaled fonts; any other value gets
            the large raster layout that was previously used for 'png'.

    Output:
        (clustData, commonCluster, treeStruct, segmentDict, harmCentr, mongo)
        where segmentDict maps 0-based segment numbers to lists of node
        labels. harmCentr and mongo are empty dicts when distDict is empty.
        Returns 1 if loadedCores is set without filepathCores.
    '''
    print("SGJRI COST:  %s" % (COST,))

    # If the number of resulting segments is not specified, it is set equal
    # to K in clara.
    if segments is None:
        segments = K
        print("as you didn't specify the number of segments, "
              "it will equal K:  %s" % (segments,))

    if loadedCores:
        if filepathCores is None:
            print("Error: no filepath for loading cores specified")
            return 1
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files from a trusted source. The text mode 'r' is kept from the
        # original -- confirm it matches how the file was written.
        with open(filepathCores, 'r') as coresFile:
            commonCluster = pickle.load(coresFile)[1]
        clustered = commonCluster.shape[0]
        # Row labels live in field 'A' of the first column.
        clustData = [commonCluster[i][0]['A'] for i in range(clustered)]
    else:
        isolates, allResults = clara(data, COST=COST, k=K,
                                     simDictClara=simDict,
                                     affinities=edgeDict,
                                     bagSize=bagSize,
                                     claraLoopNum=trials,
                                     noIsolates=True,
                                     saveAllResults=True,
                                     acceleration=acceleration)[4:6]

        # The original author notes that data is already pruned of isolates
        # at this point, which would make this filter redundant. The matrix
        # is sized from the filtered list so labels and matrix always agree,
        # whether or not that holds.
        clustData = [node for node in data if node not in isolates]
        clustered = len(clustData)
        commonCluster = np.zeros(shape=(clustered, clustered),
                                 dtype=[('count', 'f8'),
                                        ('A', 'a100'),
                                        ('B', 'a100')])
        print("isolates:  %s" % (len(isolates),))
        print("clustered:  %s" % (clustered,))

        # Node names for dendrogram labels: field 'A' carries the row's node,
        # field 'B' the column's node. Field access returns views, so these
        # assignments write into commonCluster.
        rowNames = commonCluster['A']
        colNames = commonCluster['B']
        for pos, name in enumerate(clustData):
            rowNames[pos, :] = name
            colNames[:, pos] = name

        print("co-appearance counting and writing it into numpy array...")
        # First position of every node, replacing repeated list.index calls.
        nodeIndex = {}
        for pos, name in enumerate(clustData):
            nodeIndex.setdefault(name, pos)

        counts = commonCluster['count']
        count = 0
        for res in range(len(allResults)):
            result = allResults[res]
            weight = 1. / trials
            for clus in result:
                members = [nodeIndex[node] for node in result[clus]]
                # Visit every unordered pair once and update both triangles,
                # which keeps the matrix symmetric.
                for pos, Aid in enumerate(members):
                    for Bid in members[pos + 1:]:
                        counts[Aid, Bid] += weight
                        counts[Bid, Aid] += weight
            count += 1
            if count % 10 == 0:
                print("clusterings processed: %d out of %d" % (count, trials))
        # A disabled variant used to turn the counts into distances here
        # (0 -> 20, otherwise 1 / count); the raw counts are what is
        # clustered.

    # some debug output
    print("shape:  %s" % (commonCluster.shape,))

    # Now obtaining the tree structure. Only the numeric 'count' field is
    # handed to linkage; the 'A'/'B' fields are byte-string labels.
    t = time.time()
    print("deriving three structure...")
    treeStruct = linkage(np.ascontiguousarray(commonCluster['count']),
                         method='ward')
    print("tree structure obtained in %f seconds" % (time.time() - t))

    # Then visualise the tree.
    t = time.time()
    print("constructing a dendrogram...")
    if threshold is None:
        threshold = treeStruct[len(treeStruct) - segments, 2] + 0.01
    magnify = (float(clustered) / 400.) ** 0.5
    if dendroFormat in ('svg', 'pdf'):
        leaf_font_size = 2 * (3 ** 0.5) / magnify
        figureSize = (20 * magnify, 10 * magnify)
    else:
        # 'png' and every other format: previously anything but
        # png/svg/pdf left leaf_font_size undefined.
        leaf_font_size = 13
        figureSize = (120 * magnify, 60 * magnify)
    dendro = dendrogram(treeStruct, labels=clustData,
                        leaf_font_size=leaf_font_size,
                        color_threshold=threshold)
    print("dendrogram constructed in %f seconds" % (time.time() - t))

    t = time.time()
    print("saving the dendrogram...")
    figure = plt.gcf()
    figure.set_size_inches(*figureSize)
    plt.savefig("dendrogram_" + Version + "." + dendroFormat)
    print("dendrogram saved in %f seconds" % (time.time() - t))

    # Then construct the segments. Cutting by distance just below the height
    # of row len(tree) - segments leaves that merge undone and so produces
    # segments + 1 flat clusters, the last of which used to be dropped.
    # 'maxclust' asks for at most `segments` clusters directly.
    clustIDs = fcluster(treeStruct, segments, 'maxclust')
    segmentDict = dict((i, []) for i in range(segments))
    for name, clusterID in zip(clustData, clustIDs):
        # fcluster IDs start at 1; segment numbers start at 0.
        segmentDict.setdefault(int(clusterID) - 1, []).append(name)

    # trying to get by without distDict
    if distDict != {}:
        # searching the most central nodes within the clusters
        harmCentr = intra_cluster_centrality(clustData, distDict=distDict,
                                             medoids=segmentDict)
        # combine results batch into mongo-acceptable format
        date = datetime.now().strftime("%Y-%m-%d")
        clusters = [{'number': i, 'domains': harmCentr[i]} for i in harmCentr]
        mongo = {'date': date, 'clusters': clusters}
    else:
        print("Warning: no harmCentr and mongo!")
        harmCentr = {}
        mongo = {}

    return clustData, commonCluster, treeStruct, segmentDict, harmCentr, mongo