Example #1
0
def _normalizeVectorLengths(M_lil):

    """
    Normalize the row vectors of a sparse lil_matrix in place and save it.

    Every row from index 1 onwards has its nonzero entries (except the
    first nonzero entry of the row) divided by the value stored for that
    row in the lookup returned by SearchTermDoc.createRLHash.  Progress
    and the total elapsed time are printed, and the modified matrix is
    written out through IOmodule.writeOutTDM under the name
    _tfidfName + '_norm'.

    NOTE(review): row 0 and the first nonzero column of each row are
    skipped -- presumably they hold labels/identifiers rather than
    weights; confirm against the term-doc matrix layout.

    M_lil -- the matrix to normalize; it is modified in place.
    """

    start = time.time()

    # Create a norm-hash of each row-vector in the stemmed term-doc matrix.
    vectorLength = SearchTermDoc.createRLHash(M_lil, _RLHash, False)

    for row in range(1, M_lil.shape[0]):

        norm = vectorLength[row]

        # Column indices of the nonzero entries in this row, captured
        # before any entry is rewritten.
        nonzeroCols = M_lil.getrow(row).nonzero()[1]
        for col in nonzeroCols[1:]:
            M_lil[row, col] = M_lil[row, col] / norm

        # Single-argument print() emits the same text on Python 2 and 3.
        print("Normalized: %s" % row)

    print("Total:" + str(time.time() - start))

    # Write the normalized matrix out under the '<_tfidfName>_norm' name.
    IOmodule.writeOutTDM(_termDocDir, _tfidfName + '_norm', M_lil)
Example #2
0
def _normalizeVectorLengths(M_dense, M_lil, filename):

    """
    Scale the rows of a matrix by their vector lengths and save the result.

    The same matrix is supplied twice: M_lil (lil format) is used to look
    up the per-row values from SearchTermDoc.createRLHash and to find the
    nonzero columns of each row, while M_dense is the copy whose entries
    are actually divided (in place).  The scaled dense matrix is then
    converted to coo format and written out via IOmodule.writeOutTDM
    under 'filename'.

    NOTE(review): row 0 and the first nonzero column of each row are left
    untouched -- presumably labels/identifiers; confirm with the caller.
    """

    rowNorms = SearchTermDoc.createRLHash(M_lil, None, False)

    for rowIndex in range(1, M_lil.shape[0]):

        divisor = rowNorms[rowIndex]

        # Nonzero column positions come from the sparse representation.
        occupiedCols = M_lil.getrow(rowIndex).nonzero()[1]
        for colIndex in occupiedCols[1:]:
            M_dense[rowIndex, colIndex] = M_dense[rowIndex, colIndex] / divisor

    normalizedMatrix = sparse.coo_matrix(M_dense)

    # Persist the normalized matrix.
    IOmodule.writeOutTDM(_termDocDir, filename, normalizedMatrix)