def subCluster(n,clustr,distMat):
    '''
    Takes a clustering csv (clustr) and a distance matrix csv (distMat) as inputs
    For the largest two clusters (by probability mass), builds a reduced/sliced distMat
    Breaks each of those two clusters into n subclusters via k-medoids
    '''
    import operator
    import sklearn
    from sklearn import metrics
    #get the unigram probs
    allChords = csv.reader(open('50ms 3 SDSets.csv','r',newline='\n'))
    uniProbs = {}
    for row in allChords:
        uniProbs[row[0]] = int(row[1])
    uniProbs = getProbsFromFreqs(uniProbs)
    #get the distance matrix
    diMat = []
    dists = csv.reader(open(distMat,'r',newline='\n'))
    for row in dists:
        diMat.append(row)
    disArr = np.array(diMat)#pairwise distance matrix (generalized Manhattan?) as strings
    diArr = disArr.astype(float)#now as floats
    #print(diArr)
    #get the kludgy lookup table that relates chord labels to distMat row indices
    lkps = {}
    lkp = csv.reader(open('ndistMat_lookups_rev.csv','r',newline='\n'))
    for row in lkp:
        lkps[row[1]] = row[0]
    #get the medoids and membership from the previous clustering
    meds = {}#dict of medoids: each medoid keys a list of [chord, chord, ...]
    medP = {}#dict of total unigram tallies keyed by medoid
    clsts = csv.reader(open(clustr,'r',newline='\n'))
    i = 0
    for row in clsts:
        i += 1
        if i < 3:
            continue#skip the two header rows
        if row[2] not in meds:
            meds[row[2]] = []
            medP[row[2]] = 0
        meds[row[2]].append(row[0])
        medP[row[2]] += int(row[1])
    #list of medoids sorted by descending unigram probability captured
    sorted_medP = sorted(medP.items(), key=operator.itemgetter(1), reverse=True)
    #take the two biggest clusters and generate a new intra-cluster distMat for each
    ri = str(random.randint(0,5000))#random run ID for csv bookkeeping
    sils = []#silhouette scores for the two biggest clusters
    newclus = []#format: [origin chord, unigram prob, cluster assign, medoid, distance to medoid]
    for j in range(2):
        subcl_id = []#list of row indices for new_distMat
        subcl = meds[sorted_medP[j][0]]#all the chord names in this cluster
        for chd in subcl:
            subcl_id.append(int(lkps[chd]))#the numerical row indices for those chords
        rows = np.array(subcl_id, dtype=np.intp)
        new_distMat = diArr[np.ix_(rows, rows)]#distance matrix for the subcluster
        #kmedoids on the subcluster
        clus_and_med = cluster(new_distMat,n)
        new_meds = [subcl[m] for m in clus_and_med[1]]
        msil = sklearn.metrics.silhouette_score(new_distMat,clus_and_med[0],metric='precomputed')
        print(len(clus_and_med[0]),new_distMat.shape,msil)
        sils.append([ri+'_'+str(j),msil])
        #print(new_meds)
        for l,oc in enumerate(subcl):
            newclus.append([oc,uniProbs[oc],clus_and_med[0][l],subcl[clus_and_med[0][l]],new_distMat[l,clus_and_med[0][l]]])
    #now dump the new clusterings into a csv
    csvName = 'subClus/subcluster test'+ri+'.csv'
    file = open(csvName,'w',newline='\n')
    lw = csv.writer(file)
    lw.writerow(['origin chord','uprob','cluster','medoid','distance'])
    for row in newclus:
        lw.writerow(row)
    file.close()
    #append the silhouette scores for this run
    file2 = open('subClus/subClus_silh.csv','a',newline='\n')
    lw2 = csv.writer(file2)
    for row in sils:
        lw2.writerow(row)
    file2.close()
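# Example call (an illustrative sketch, not from the source): this assumes a prior
# k-medoids run has written a clustering csv such as 'cluster test.csv' (filename
# hypothetical) in the two-header-row format read above, and that csv, numpy (as np),
# random, and the module-level helpers getProbsFromFreqs() and cluster() are
# available elsewhere in this module.
#
#   subCluster(4, 'cluster test.csv', 'ndistMat.csv')
#
# would split the two most probability-heavy clusters into 4 subclusters each,
# writing 'subClus/subcluster test<ri>.csv' and appending the two silhouette
# scores to 'subClus/subClus_silh.csv'. ('ndistMat.csv' is likewise a hypothetical
# distance-matrix filename.)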
def metaCluster(fld,k):
    '''
    For a collection of (locally-convergent) clusterings in folder fld
    Take each origin chord and track its cluster IDs across clusterings
    Compare the resulting membership vectors and cluster by THEIR similarity
    Hamming distance (how many entries differ) seems most appropriate
    '''
    listing = os.listdir(fld)
    clusDict = {}#cluster assignments across runs: clusDict[origin chord] = [assign 1, assign 2, ...]
    ocs = []#will be the list of keys from clusDict
    clusMat = []#will be the list of cluster-assignment vectors in the order of ocs, also from clusDict
    #pull chord names and probabilities for the topN chords
    allChords = csv.reader(open('50ms 3 SDSets.csv','r',newline='\n'))
    uniProbs = {}
    for row in allChords:
        #make a dict of the topN most unigram-probable sds
        uniProbs[row[0]] = int(row[1])
    uniProbs = getProbsFromFreqs(uniProbs)
    for f in listing:
        #get the clustering data for this run
        address = os.path.join(fld, f)
        allOCs = csv.reader(open(address,'r',newline='\n'))
        lstOfOCs = []
        for row in allOCs:
            lstOfOCs.append(row)#should be topN of these
        #for each origin chord, append its assignment to the relevant clusDict entry list
        for j,oc in enumerate(lstOfOCs):
            if j < 2:
                continue#cut the two header rows
            if oc[0] not in clusDict:
                clusDict[oc[0]] = []#make an entry if there isn't one
            clusDict[oc[0]].append(int(oc[2]))#stick the (int) cluster assignment in the dict
    #build (stable, ordered) ocs and clusMat lists
    for key in clusDict.keys():
        ocs.append(key)#this tells us what the rows of clusMat refer to
        clusMat.append(clusDict[key])#this is what we'll cluster
    print('clusDict',clusDict)
    print('first row of oc list and first row of clusMat')
    print(ocs[0],clusMat[0])
    #now calculate Hamming distances between rows/ocs
    distMat = pdist(clusMat,metric='hamming')
    distMat_sq = squareform(distMat)#redundant, square form of the condensed distances
    print(distMat_sq)
    #kmedoids
    clus_and_med = cluster(distMat_sq,k)
    meds = [ocs[med] for med in clus_and_med[1]]
    clus = []#format: [origin chord, unigram prob, cluster assignment, medoid, distance from medoid]
    for l,oc in enumerate(ocs):
        clus.append([oc,uniProbs[oc],clus_and_med[0][l],ocs[clus_and_med[0][l]],distMat_sq[l,clus_and_med[0][l]]])
    #send out the csv
    csvName = 'metacluster test.csv'
    file = open(csvName,'w',newline='\n')
    lw = csv.writer(file)
    lw.writerow(meds)
    lw.writerow(['origin chord','uprob','cluster','medoid','distance'])
    for row in clus:
        lw.writerow(row)
    file.close()
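# Example call (an illustrative sketch, not from the source): assuming a folder of
# clustering csvs in the format written above (folder name hypothetical), and that
# os, csv, scipy's pdist/squareform, and the module-level helpers getProbsFromFreqs()
# and cluster() are imported elsewhere in this module:
#
#   metaCluster('subClus/', 5)
#
# tracks each origin chord's assignment across those runs, builds a single Hamming
# distance matrix over the assignment vectors, and writes the consensus clustering
# to 'metacluster test.csv'.

# Both functions rely on a module-level cluster(distMat, k) helper that is not shown
# in this section. From the way its return value is indexed, it appears to return
# (labels, medoid_indices), with each label given as the ROW INDEX of that point's
# medoid in distMat. The sketch below (the name kmedoids_sketch is hypothetical and
# this is NOT the author's implementation) shows one standard Voronoi-iteration
# k-medoids that satisfies that interface:

import numpy as np#presumably already imported at module level; repeated so the sketch is self-contained

def kmedoids_sketch(D, k, max_iter=100, seed=0):
    '''
    Voronoi-iteration k-medoids on a precomputed square distance matrix D.
    Returns (labels, medoids): labels[i] is the row index of point i's medoid
    (the convention the code above indexes by); medoids is the array of k
    medoid row indices.
    '''
    rng = np.random.default_rng(seed)
    n = D.shape[0]
    medoids = np.sort(rng.choice(n, size=k, replace=False))
    for _ in range(max_iter):
        #assign each point to its nearest current medoid (label = medoid row index)
        labels = medoids[np.argmin(D[:, medoids], axis=1)]
        new_medoids = []
        for m in medoids:
            members = np.where(labels == m)[0]
            if members.size == 0:
                new_medoids.append(m)#keep an emptied medoid where it is
                continue
            #new medoid = the member minimizing total distance to its cluster
            costs = D[np.ix_(members, members)].sum(axis=1)
            new_medoids.append(members[np.argmin(costs)])
        new_medoids = np.sort(np.array(new_medoids))
        if np.array_equal(new_medoids, medoids):
            break#converged: medoids stopped moving
        medoids = new_medoids
    labels = medoids[np.argmin(D[:, medoids], axis=1)]
    return labels, medoids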