Example #1
0
def countFrequency(matrixFname, rowFname, colFname):
    """
    Read the matrix and compute row and column totals.
    If row and col files are given then we will read the ids from those files.
    We will then write the sorted frequencies to files with extensions .total.
    """
    D = DMATRIX()
    print "Loading matrix,", matrixFname, "...",
    D.read_matrix(matrixFname)
    print "done"
    print "Loading row ids,", rowFname, "...",
    rows = loadIds(rowFname)
    print "done"
    print "Loading col ids,", colFname, "...",
    cols = loadIds(colFname)
    print "done"
    print "Getting row sums..."
    for rowid in rows:
        rows[rowid]['freq'] = D.get_row_sum(rowid)
    print "Getting column sums..."
    for colid in cols:
        cols[colid]['freq'] = D.get_col_sum(colid)
    rowItems = rows.items()
    colItems = cols.items()
    print "sorting rows...",
    rowItems.sort(sort_ids)
    print "done"
    print "sorting columns...",
    colItems.sort(sort_ids)
    print "done"
    saveIds("%s.total" % rowFname, rowItems)
    saveIds("%s.total" % colFname, colItems)
    pass
Example #2
0
def load_matrix(matrix_fname):
    """
    Read the data matrix.
    """
    global M
    print "Loading matrix: %s" % matrix_fname
    M = DMATRIX()
    M.read_matrix(matrix_fname)
    return M
Example #3
0
def load_matrix(matrix_fname):
    """
    Read the data matrix.
    """
    global M
    print "Loading matrix: %s" % matrix_fname
    M = DMATRIX()
    M.read_matrix(matrix_fname)
    return M
Example #4
0
def perform_sequential_coclustering(matrix_fname, clust_fname, theta, phi,
                                    verbose = True):
    """
    Perform sequenctial clustering.
    """
    M = DMATRIX(True)
    print "Reading data matrix...",
    M.read_matrix(matrix_fname)
    print "Done."
    clustAlgo = SEQCLUST()
    clustAlgo.VERBOSE = verbose
    (rowclusts,colclusts) = clustAlgo.coclustering(M, theta, phi)
    clustAlgo.write_coclusters(rowclusts, colclusts, theta, phi, clust_fname)
    sys.stderr.write("Clustering Finished....Terminating\n")
    pass
Example #5
0
def save_transpose(mat_fname, trans_fname):
    """
    Read the matrix in mat_fname, transpose it, and write the result
    to trans_fname.
    """
    matrix = DMATRIX()
    matrix.read_matrix(mat_fname)
    matrix.transpose()
    matrix.write_matrix(trans_fname)
Example #6
0
def save_transpose(mat_fname, trans_fname):
    """
    Read the matrix in mat_fname, transpose it, and write the result
    to trans_fname.
    """
    matrix = DMATRIX()
    matrix.read_matrix(mat_fname)
    matrix.transpose()
    matrix.write_matrix(trans_fname)
Example #7
0
def countFrequency(matrixFname, rowFname, colFname):
    """
    Read the matrix and compute row and column totals.
    If row and col files are given then we will read the ids from those files.
    We will then write the sorted frequencies to files with extensions .total.
    """
    D = DMATRIX()
    print "Loading matrix,", matrixFname, "...",
    D.read_matrix(matrixFname)
    print "done"
    print "Loading row ids,", rowFname, "...",
    rows = loadIds(rowFname)
    print "done"
    print "Loading col ids,", colFname, "...",
    cols = loadIds(colFname)
    print "done"
    print "Getting row sums..."
    for rowid in rows:
        rows[rowid]['freq'] = D.get_row_sum(rowid)
    print "Getting column sums..."
    for colid in cols:
        cols[colid]['freq'] = D.get_col_sum(colid)
    rowItems = rows.items()
    colItems = cols.items()
    print "sorting rows...",
    rowItems.sort(sort_ids)
    print "done"
    print "sorting columns...",
    colItems.sort(sort_ids)
    print "done"
    saveIds("%s.total" % rowFname, rowItems)
    saveIds("%s.total" % colFname, colItems)
    pass
Example #8
0
def centroidRank(clustFname,
                 matrixFname,
                 thesaurusFname,
                 idFname=None,
                 rows=False):
    """
    Rank the elements of each cluster by similarity to the cluster centroid.

    The clusters are loaded from clustFname (with IGNORE_SINGLETONS=True)
    and the matrix from matrixFname. For every cluster the vectors of its
    elements are averaged to form the centroid, each element is scored by
    its cosine_similarity to that centroid, and the scores are ordered
    with elementSorter (NOTE(review): expected to give descending
    similarity -- confirm against elementSorter). One line per cluster is
    written to thesaurusFname, made of tab-terminated "label:similarity"
    entries.

    clustFname     -- file holding the clusters.
    matrixFname    -- file holding the data matrix.
    thesaurusFname -- output file for the ranked clusters.
    idFname        -- optional file of element names. When given, the name
                      looked up for each element is written instead of the
                      element itself.
    rows           -- if True the cluster elements are matrix rows,
                      otherwise (the default) they are matrix columns.
    """
    C = CLUSTERS()
    C.loadClusters(clustFname, IGNORE_SINGLETONS=True)
    C.showInfo()
    M = DMATRIX()
    M.read_matrix(matrixFname)
    # choose the accessor once instead of testing `rows` for every element.
    if rows:
        getVector = M.get_row
    else:
        getVector = M.get_column
    clusters = []
    for clustID in C.getClustIDList():
        # accumulate the centroid, remembering each member's vector so it
        # is fetched from the matrix only once.
        centroid = VECTOR()
        keys = set()
        members = []
        for element in C.getElementsByClusterID(clustID):
            v = getVector(element)
            centroid.add(v)
            keys.update(v)
            members.append((element, v))
        # take the mean over EVERY key seen in the cluster. Dividing only
        # the keys of the last vector would leave the other components as
        # raw sums and distort the centroid direction. An empty cluster
        # has no keys, so nothing is divided.
        n = float(len(members))
        for k in keys:
            centroid[k] = centroid[k] / n
        # compute the similarity to the centroid and sort the elements.
        simScores = [(cosine_similarity(v, centroid), element)
                     for (element, v) in members]
        simScores.sort(elementSorter)
        clusters.append(simScores)
    # load the names before opening the output, so that a failure here
    # does not leave a truncated thesaurus file behind.
    names = None
    if idFname:
        names = loadElementNames(idFname)
    thesaurus = open(thesaurusFname, "w")
    try:
        for clust in clusters:
            for (sim, ele) in clust:
                # write the name when an id file was given, otherwise the
                # element itself.
                if idFname:
                    label = names[ele]
                else:
                    label = ele
                thesaurus.write("%s:%f\t" % (label, sim))
            thesaurus.write("\n")
    finally:
        # always release the file handle, even if a lookup or write fails.
        thesaurus.close()
Example #9
0
def centroidRank(clustFname,
                 matrixFname,
                 thesaurusFname,
                 idFname=None,
                 rows=False):
    """
    Rank the elements of each cluster by similarity to the cluster centroid.

    The clusters are loaded from clustFname (with IGNORE_SINGLETONS=True)
    and the matrix from matrixFname. For every cluster the vectors of its
    elements are averaged to form the centroid, each element is scored by
    its cosine_similarity to that centroid, and the scores are ordered
    with elementSorter (NOTE(review): expected to give descending
    similarity -- confirm against elementSorter). One line per cluster is
    written to thesaurusFname, made of tab-terminated "label:similarity"
    entries.

    clustFname     -- file holding the clusters.
    matrixFname    -- file holding the data matrix.
    thesaurusFname -- output file for the ranked clusters.
    idFname        -- optional file of element names. When given, the name
                      looked up for each element is written instead of the
                      element itself.
    rows           -- if True the cluster elements are matrix rows,
                      otherwise (the default) they are matrix columns.
    """
    C = CLUSTERS()
    C.loadClusters(clustFname, IGNORE_SINGLETONS=True)
    C.showInfo()
    M = DMATRIX()
    M.read_matrix(matrixFname)
    # choose the accessor once instead of testing `rows` for every element.
    if rows:
        getVector = M.get_row
    else:
        getVector = M.get_column
    clusters = []
    for clustID in C.getClustIDList():
        # accumulate the centroid, remembering each member's vector so it
        # is fetched from the matrix only once.
        centroid = VECTOR()
        keys = set()
        members = []
        for element in C.getElementsByClusterID(clustID):
            v = getVector(element)
            centroid.add(v)
            keys.update(v)
            members.append((element, v))
        # take the mean over EVERY key seen in the cluster. Dividing only
        # the keys of the last vector would leave the other components as
        # raw sums and distort the centroid direction. An empty cluster
        # has no keys, so nothing is divided.
        n = float(len(members))
        for k in keys:
            centroid[k] = centroid[k] / n
        # compute the similarity to the centroid and sort the elements.
        simScores = [(cosine_similarity(v, centroid), element)
                     for (element, v) in members]
        simScores.sort(elementSorter)
        clusters.append(simScores)
    # load the names before opening the output, so that a failure here
    # does not leave a truncated thesaurus file behind.
    names = None
    if idFname:
        names = loadElementNames(idFname)
    thesaurus = open(thesaurusFname, "w")
    try:
        for clust in clusters:
            for (sim, ele) in clust:
                # write the name when an id file was given, otherwise the
                # element itself.
                if idFname:
                    label = names[ele]
                else:
                    label = ele
                thesaurus.write("%s:%f\t" % (label, sim))
            thesaurus.write("\n")
    finally:
        # always release the file handle, even if a lookup or write fails.
        thesaurus.close()