def subCluster(n,clustr,distMat):
    '''
    Takes a clustering csv (clustr) and a distance-matrix csv (distMat) as inputs
    For the largest two clusters (by probability mass), builds a reduced/sliced distMat
    Breaks each of those two clusters into n subclusters via k-medoids
    '''
    import csv
    import operator
    import random
    import numpy as np
    import sklearn
    from sklearn import metrics
    
    #get the unigram probs
    allChords = csv.reader(open('50ms 3 SDSets.csv','r',newline='\n'))
    uniProbs = {}
    for row in allChords:
        uniProbs[row[0]] = int(row[1])
    uniProbs = getProbsFromFreqs(uniProbs)
    
    #get the distance matrix
    diMat = []
    dists = csv.reader(open(distMat, 'r',newline='\n'))
    for row in dists:
        diMat.append(row)
    disArr = np.array(diMat)#pairwise distance matrix (generalized Manhattan?) read as strings
    diArr = disArr.astype(float)#same matrix cast to floats
    #print(diArr)
    
    #get the kludgy lookup dict that maps chord labels to distMat row indices
    lkps = {}
    lkp = csv.reader(open('ndistMat_lookups_rev.csv','r',newline='\n'))
    for row in lkp:
        lkps[row[1]] = int(row[0])#chord label -> row index in the distance matrix
    
    #get the medoids and membership from prev clustering
    meds = {}#dict of medoids: each medoid keys a list of [chord, chord,...]
    medP = {}#dict of total unigram tallies keyed by medoid
    clsts = csv.reader(open(clustr, 'r',newline='\n'))
    i=0
    for row in clsts:
        i+=1
        if i < 3: continue#skip the two header rows
        if row[2] not in meds:
            meds[row[2]] = []
            medP[row[2]] = 0
        meds[row[2]].append(row[0])
        medP[row[2]] += float(row[1])#accumulate unigram mass (works for counts or probabilities)
    #list of medoids sorted by descending unigram probability captured
    sorted_medP = sorted(medP.items(), key=operator.itemgetter(1), reverse=True)
    
    #take the two biggest clusters and generate a new intra-clus disMat
    ri = str(random.randint(0,5000))#random tag to keep output csv names distinct
    sils = []#silhouette scores for the two biggest clusters' subclusterings
    newclus = []#format: [origin chord, unigram prob, cluster assign, medoid, distance to medoid]
    for j in range(2):
        subcl_id = []#this will be a list of row indices for new_distMat
        subcl = meds[sorted_medP[j][0]]#all the chord names in med
        for chd in subcl:
            subcl_id.append(lkps[chd])#the numerical maps for those chords
        rows = np.array(subcl_id, dtype=np.intp)
        new_distMat = diArr[np.ix_(rows, rows)]#distance matrix for subcluster
        
        #kmedoids on the subcluster
        clus_and_med = cluster(new_distMat,n)#(assignments as medoid row indices, list of medoid row indices)
        new_meds = [subcl[m] for m in clus_and_med[1]]#chord names of the new medoids
        msil = sklearn.metrics.silhouette_score(new_distMat,clus_and_med[0],metric='precomputed')
        print(len(clus_and_med[0]),new_distMat.shape,msil)
        sils.append([ri+'_'+str(j),msil])
        #print(new_meds)
        for l,oc in enumerate(subcl):
            newclus.append([oc,uniProbs[oc],clus_and_med[0][l],subcl[clus_and_med[0][l]],new_distMat[l,clus_and_med[0][l]]])
    
    #now dump the new clusterings into a csv
    csvName = 'subClus/subcluster test'+ri+'.csv'
    with open(csvName, 'w', newline='\n') as file:
        lw = csv.writer(file)
        lw.writerow(['origin chord','uprob','cluster','medoid','distance'])
        for row in newclus:
            lw.writerow(row)
    #append this run's silhouette scores to a running log
    with open('subClus/subClus_silh.csv','a',newline='\n') as file2:
        lw2 = csv.writer(file2)
        for row in sils:
            lw2.writerow(row)
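
# Both functions in this section call a `cluster(distMat, k)` helper that is
# defined elsewhere in the module.  Judging from how its return value is used
# (element 0 is a per-row assignment expressed as the assigned medoid's row
# index, element 1 is the list of medoid row indices), it is assumed to be a
# PAM-style k-medoids over a precomputed distance matrix.  The sketch below is
# only a guess at that interface, not the original implementation.
def kmedoids_sketch(distMat, k, iters=100, seed=None):
    import numpy as np
    rng = np.random.default_rng(seed)
    n_pts = distMat.shape[0]
    medoids = rng.choice(n_pts, size=k, replace=False)
    for _ in range(iters):
        #assign each point to its nearest medoid, recording that medoid's row index
        assign = medoids[np.argmin(distMat[:, medoids], axis=1)]
        #re-pick each medoid as the member minimizing total intra-cluster distance
        new_medoids = []
        for m in medoids:
            members = np.where(assign == m)[0]
            if len(members) == 0:#empty cluster: keep the old medoid
                new_medoids.append(m)
                continue
            costs = distMat[np.ix_(members, members)].sum(axis=0)
            new_medoids.append(members[np.argmin(costs)])
        new_medoids = np.array(new_medoids)
        if np.array_equal(np.sort(new_medoids), np.sort(medoids)):
            break
        medoids = new_medoids
    assign = medoids[np.argmin(distMat[:, medoids], axis=1)]
    return assign, medoids
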
def metaCluster(fld,k):
    '''
    For a collection of (locally-convergent) clusterings in fld
    Take each origin chord and track its cluster IDs across clusterings
    Compare the resulting membership vectors and cluster by THEIR similarity
    Hamming distance (the fraction of clusterings in which two chords' assignments differ) seems most appropriate
    '''
    import os
    import csv
    from scipy.spatial.distance import pdist, squareform
    listing = os.listdir(fld)
    clusDict = {}#dict of cluster assignments across runs: clusDict[origin chord]=[assign 1, assign 2, ...]
    ocs = []#will be list of keys from clusDict
    clusMat = []#will be list of cluster assignments in order of ocs, also from clusDict
    
    #Pull chord names and probabilities for topN chords
    allChords = csv.reader(open('50ms 3 SDSets.csv','r',newline='\n'))
    uniProbs = {}
    for row in allChords:
        #record the raw unigram count for each sd set (converted to probabilities below)
        uniProbs[row[0]] = int(row[1])
    uniProbs = getProbsFromFreqs(uniProbs)
    
    for f in listing:
        #Get clustering data
        address = fld + f
        allOCs = csv.reader(open(address,'r',newline='\n'))
        lstOfOCs = []
        for row in allOCs:
            lstOfOCs.append(row)#should be topN of these
        
        #for each origin chord, append its assignment to the relevant clusDict entry list
        for j,oc in enumerate(lstOfOCs):
            if j < 2: continue#cut the two header rows
            if oc[0] not in clusDict: clusDict[oc[0]] = []#make an entry if there isn't one
            clusDict[oc[0]].append(int(oc[2]))#stick the (int) cluster assignment in dict
            
    #build (stable, ordered) ocs and clusMat lists
    for key in clusDict.keys():
        ocs.append(key)#this tells us what the rows of clusMat refer to
        clusMat.append(clusDict[key])#this is what we'll cluster
    print('clusDict',clusDict)
    print('first row of oc list and first row of clusMat')
    print(ocs[0],clusMat[0])
    
    #now, calculate hamming distances between rows/ocs
    distMat = pdist(clusMat,metric='hamming')#assumes each origin chord appears in every clustering (equal-length rows)
    distMat_sq = squareform(distMat)#redundant, square
    print(distMat_sq)
    #kmedoids
    clus_and_med = cluster(distMat_sq,k)
    meds = [ocs[med] for med in clus_and_med[1]]
    clus = []#format: [origin chord, unigram prob, cluster assignment, medoid, distance from medoid]
    for l,oc in enumerate(ocs):
        clus.append([oc,uniProbs[oc],clus_and_med[0][l],ocs[clus_and_med[0][l]],distMat_sq[l,clus_and_med[0][l]]])
    
    #send out the csv
    csvName = 'metacluster test.csv'
    with open(csvName, 'w', newline='\n') as file:
        lw = csv.writer(file)
        lw.writerow(meds)#first row: the medoid chords themselves
        lw.writerow(['origin chord','uprob','cluster','medoid','distance'])
        for row in clus:
            lw.writerow(row)
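
# The `getProbsFromFreqs` helper used in both functions above is likewise
# defined elsewhere; from its use it appears to normalize a dict of raw chord
# counts into unigram probabilities.  A minimal sketch under that assumption:
def getProbsFromFreqs_sketch(freqs):
    total = sum(freqs.values())
    return {chord: count / total for chord, count in freqs.items()}

# Hypothetical usage (file and folder names here are illustrative, not taken
# from the original):
#
#   subCluster(4, 'cluster test.csv', 'ndistMat.csv')
#   metaCluster('clusterings/', 10)  #fld holds clustering csvs, each with two header rows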