def countFrequency(matrixFname, rowFname, colFname): """ Read the matrix and compute row and column totals. If row and col files are given then we will read the ids from those files. We will then write the sorted frequencies to files with extensions .total. """ D = DMATRIX() print "Loading matrix,", matrixFname, "...", D.read_matrix(matrixFname) print "done" print "Loading row ids,", rowFname, "...", rows = loadIds(rowFname) print "done" print "Loading col ids,", colFname, "...", cols = loadIds(colFname) print "done" print "Getting row sums..." for rowid in rows: rows[rowid]['freq'] = D.get_row_sum(rowid) print "Getting column sums..." for colid in cols: cols[colid]['freq'] = D.get_col_sum(colid) rowItems = rows.items() colItems = cols.items() print "sorting rows...", rowItems.sort(sort_ids) print "done" print "sorting columns...", colItems.sort(sort_ids) print "done" saveIds("%s.total" % rowFname, rowItems) saveIds("%s.total" % colFname, colItems) pass
def load_matrix(matrix_fname): """ Read the data matrix. """ global M print "Loading matrix: %s" % matrix_fname M = DMATRIX() M.read_matrix(matrix_fname) return M
def save_transpose(mat_fname, trans_fname):
    """
    Load the matrix stored in mat_fname, transpose it, and write the
    transposed matrix to trans_fname.
    """
    matrix = DMATRIX()
    matrix.read_matrix(mat_fname)
    matrix.transpose()
    matrix.write_matrix(trans_fname)
def perform_sequential_coclustering(matrix_fname, clust_fname, theta, phi, verbose = True): """ Perform sequenctial clustering. """ M = DMATRIX(True) print "Reading data matrix...", M.read_matrix(matrix_fname) print "Done." clustAlgo = SEQCLUST() clustAlgo.VERBOSE = verbose (rowclusts,colclusts) = clustAlgo.coclustering(M, theta, phi) clustAlgo.write_coclusters(rowclusts, colclusts, theta, phi, clust_fname) sys.stderr.write("Clustering Finished....Terminating\n") pass
def centroidRank(clustFname, matrixFname, thesaurusFname, idFname=None, rows=False):
    """
    Rank the elements of each cluster by similarity to the cluster centroid.

    Given a set of clusters (column clusters by default; row clusters when
    rows is True) in clustFname and the data matrix in matrixFname, compute
    the centroid of each cluster and sort the cluster's elements in the
    descending order of their cosine similarity to that centroid.  The
    sorted clusters are written to thesaurusFname, one cluster per line, as
    element:similarity pairs.  If idFname is given, element ids are
    replaced by the names loaded from that file.
    """
    C = CLUSTERS()
    C.loadClusters(clustFname, IGNORE_SINGLETONS=True)
    C.showInfo()
    M = DMATRIX()
    M.read_matrix(matrixFname)
    clusters = []
    for clustID in C.getClustIDList():
        # Accumulate the sum of the cluster's vectors in centroid.
        centroid = VECTOR()
        n = 0
        for element in C.getElementsByClusterID(clustID):
            if rows:
                v = M.get_row(element)
            else:
                v = M.get_column(element)
            centroid.add(v)
            n += 1
        # BUGFIX: the mean must be taken over the centroid's OWN keys, not
        # over the keys of the last added vector v; previously, dimensions
        # absent from the last vector were never divided by n.
        # (assumes VECTOR iterates its keys like the vectors returned by
        # get_row/get_column do -- TODO confirm against VECTOR's API)
        if n:  # guard against an empty cluster (n == 0)
            for k in centroid:
                centroid[k] = centroid[k] / float(n)
        # Score each cluster member against the centroid.
        simScores = []
        for element in C.getElementsByClusterID(clustID):
            if rows:
                v = M.get_row(element)
            else:
                v = M.get_column(element)
            simScores.append((cosine_similarity(v, centroid), element))
        # Sort the elements within the cluster.
        simScores.sort(elementSorter)
        clusters.append(simScores)
    # Write the ranked clusters; use names instead of raw ids when an
    # id file was given (lookup table loaded once, outside the loop).
    names = loadElementNames(idFname) if idFname else None
    thesaurus = open(thesaurusFname, "w")
    for clust in clusters:
        for (sim, ele) in clust:
            if names is not None:
                thesaurus.write("%s:%f\t" % (names[ele], sim))
            else:
                thesaurus.write("%s:%f\t" % (ele, sim))
        thesaurus.write("\n")
    thesaurus.close()
# NOTE(review): this is a duplicate definition of centroidRank (an
# earlier, near-identical copy exists above in this file); at import
# time this later definition shadows the earlier one.  Consider
# deleting one of the two copies.
def centroidRank(clustFname, matrixFname, thesaurusFname, idFname=None, rows=False):
    """
    Rank the elements of each cluster by similarity to the cluster centroid.

    Given a set of clusters (column clusters by default; row clusters when
    rows is True) in clustFname and the data matrix in matrixFname, compute
    the centroid of each cluster and sort the cluster's elements in the
    descending order of their cosine similarity to that centroid.  The
    sorted clusters are written to thesaurusFname, one cluster per line, as
    element:similarity pairs.  If idFname is given, element ids are
    replaced by the names loaded from that file.
    """
    C = CLUSTERS()
    C.loadClusters(clustFname, IGNORE_SINGLETONS=True)
    C.showInfo()
    M = DMATRIX()
    M.read_matrix(matrixFname)
    clusters = []
    for clustID in C.getClustIDList():
        # Accumulate the sum of the cluster's vectors in centroid.
        centroid = VECTOR()
        n = 0
        for element in C.getElementsByClusterID(clustID):
            if rows:
                v = M.get_row(element)
            else:
                v = M.get_column(element)
            centroid.add(v)
            n += 1
        # BUGFIX: the mean must be taken over the centroid's OWN keys, not
        # over the keys of the last added vector v; previously, dimensions
        # absent from the last vector were never divided by n.
        # (assumes VECTOR iterates its keys like the vectors returned by
        # get_row/get_column do -- TODO confirm against VECTOR's API)
        if n:  # guard against an empty cluster (n == 0)
            for k in centroid:
                centroid[k] = centroid[k] / float(n)
        # Score each cluster member against the centroid.
        simScores = []
        for element in C.getElementsByClusterID(clustID):
            if rows:
                v = M.get_row(element)
            else:
                v = M.get_column(element)
            simScores.append((cosine_similarity(v, centroid), element))
        # Sort the elements within the cluster.
        simScores.sort(elementSorter)
        clusters.append(simScores)
    # Write the ranked clusters; use names instead of raw ids when an
    # id file was given (lookup table loaded once, outside the loop).
    names = loadElementNames(idFname) if idFname else None
    thesaurus = open(thesaurusFname, "w")
    for clust in clusters:
        for (sim, ele) in clust:
            if names is not None:
                thesaurus.write("%s:%f\t" % (names[ele], sim))
            else:
                thesaurus.write("%s:%f\t" % (ele, sim))
        thesaurus.write("\n")
    thesaurus.close()