def _normalizeVectorLengths(M_lil):
    """
    Divide the entries of a sparse lil_matrix by per-row norms, in place,
    and write the resulting matrix out.

    For every row except row 0, each nonzero entry of the row except the
    first one is divided by the value that SearchTermDoc.createRLHash
    yields for that row.  Progress and the total elapsed time are printed,
    and the matrix is handed to IOmodule.writeOutTDM under the name
    _tfidfName + '_norm'.

    NOTE(review): row 0 and the first nonzero column of each row are left
    untouched -- presumably they carry index/identifier values rather than
    weights; confirm against the code that builds the matrix.
    """
    started = time.time()

    # Per-row norms of the row vectors in the term-doc matrix.
    rowNorms = SearchTermDoc.createRLHash(M_lil, _RLHash, False)

    rowCount = M_lil.shape[0]
    for row in range(1, rowCount):
        norm = rowNorms[row]
        nonzeroCols = M_lil.getrow(row).nonzero()[1]
        for col in nonzeroCols[1:]:
            current = M_lil[row, col]
            M_lil[row, col] = current / norm
        # A single parenthesised argument prints the same text whether
        # print is a statement or a function.
        print("%s %s" % ("Normalized:", row))

    elapsed = time.time() - started
    print("Total:" + str(elapsed))

    # Save the normalized matrix under the '_norm' variant of the tf-idf name.
    IOmodule.writeOutTDM(_termDocDir, _tfidfName + '_norm', M_lil)
def _normalizeVectorLengths(M_dense, M_lil, filename):
    """
    Divide the entries of a matrix by per-row norms and write it out.

    The same matrix is supplied twice: 'M_lil' is used to obtain the row
    norms (via SearchTermDoc.createRLHash) and the positions of the nonzero
    entries, while the divisions are applied in place to 'M_dense'.  For
    every row except row 0, each nonzero entry except the first one in the
    row is divided by that row's norm.  The result is converted with
    sparse.coo_matrix and passed to IOmodule.writeOutTDM as 'filename'.

    NOTE(review): row 0 and the first nonzero column of each row are left
    untouched -- presumably they carry index/identifier values rather than
    weights; confirm against the code that builds the matrix.
    """
    rowNorms = SearchTermDoc.createRLHash(M_lil, None, False)

    rowCount = M_lil.shape[0]
    for row in range(1, rowCount):
        norm = rowNorms[row]
        # Nonzero positions come from the lil representation; the values
        # are updated in the dense one.
        nonzeroCols = M_lil.getrow(row).nonzero()[1]
        for col in nonzeroCols[1:]:
            current = M_dense[row, col]
            M_dense[row, col] = current / norm

    # Save the matrix
    IOmodule.writeOutTDM(_termDocDir, filename, sparse.coo_matrix(M_dense))