def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
    """Return every non-zero entry of ``matches`` as one DataFrame row.

    :param matches: sparse matrix whose stored values are similarities.
    :return: DataFrame with the columns ``master_side`` (row index, int64),
        ``dupe_side`` (column index, int64) and ``similarity`` (stored value).
    """
    # Take indices and values from the same COO view and filter them with one
    # mask. ``nonzero()`` skips explicitly stored zeros while ``.data`` keeps
    # them, so mixing the two can yield columns of different lengths.
    coo = matches.tocoo()
    keep = coo.data != 0
    matches_list = pd.DataFrame({'master_side': coo.row[keep].astype(np.int64),
                                 'dupe_side': coo.col[keep].astype(np.int64),
                                 'similarity': coo.data[keep]})
    return matches_list
def compute_information_gain(vectorizer: CountVectorizer, word: str, dataTrain: csr_matrix,
                             targetTrain: [int]) -> float:
    """Return the information gain obtained by splitting the rows on ``word``.

    Each row of ``dataTrain`` is mapped back to its terms with
    ``vectorizer.inverse_transform``; rows are then split by whether the
    lower-cased ``word`` is among those terms. The result is the entropy of
    ``targetTrain`` minus the size-weighted entropies of the two groups.

    :param vectorizer: fitted vectorizer used to recover the terms of a row.
    :param word: term to split on (compared in lower case).
    :param dataTrain: one vectorized document per row.
    :param targetTrain: label per row; the counters below only hold keys 0 and 1.
    :return: parent entropy minus the weighted child entropies.
    """
    needle = word.lower()
    parentEntropy = computeEntropy(targetTrain)
    total_rows = dataTrain.get_shape()[0]

    # Per-label row counts for rows containing / not containing the word.
    with_word = {0: 0, 1: 0}
    without_word = {0: 0, 1: 0}
    for row_idx in range(total_rows):
        row_terms = vectorizer.inverse_transform(dataTrain[row_idx])[0]
        bucket = with_word if needle in row_terms else without_word
        bucket[targetTrain[row_idx]] += 1

    # Expand the counts back into label lists, the input computeEntropy expects.
    labels_with = [0] * with_word[0] + [1] * with_word[1]
    labels_without = [0] * without_word[0] + [1] * without_word[1]

    entropy_with = computeEntropy(labels_with)
    entropy_without = computeEntropy(labels_without)
    weight_with = len(labels_with) / total_rows
    weight_without = len(labels_without) / total_rows

    return parentEntropy - (entropy_with * weight_with + entropy_without * weight_without)
def initialize_memmap_object(self, matrix_object: csr_matrix, path_memmap_object: str) -> memmap:
    """Write ``matrix_object`` as a dense float64 array into a memory-mapped file.

    :param matrix_object: sparse matrix to materialize.
    :param path_memmap_object: path of the backing file, opened with mode ``w+``.
    :return: the memmap, with the same shape as ``matrix_object``.
    """
    mapped = memmap(path_memmap_object,
                    dtype='float64',
                    mode='w+',
                    shape=matrix_object.shape)
    # The whole matrix is densified in memory before being copied over.
    dense_values = matrix_object.todense()
    mapped[:] = dense_values[:]
    return mapped
def _symmetrize_matrix_and_fix_diagonal(
        x_non_symmetric: csr_matrix) -> csr_matrix:
    """Mirror every non-zero entry across the diagonal and set the diagonal to 1.

    The input is converted to LIL format, each non-zero ``(i, j)`` value is
    written to ``(j, i)``, every diagonal entry is set to 1, and the result is
    returned as CSR.

    NOTE(review): the mirrored values are read before any are written, so if
    both ``(i, j)`` and ``(j, i)`` hold different values they appear to be
    swapped rather than made equal. Confirm callers never pass such input.
    """
    mirrored = x_non_symmetric.tolil()

    rows, cols = mirrored.nonzero()
    source_values = mirrored[rows, cols]
    mirrored[cols, rows] = source_values

    diagonal = np.arange(mirrored.shape[0])
    mirrored[diagonal, diagonal] = 1

    return mirrored.tocsr()
def calc_score(y_true: csr_matrix, y_pred: csr_matrix):
    """Score predictions given in sparse format.

    A hit is a position that is non-zero in both ``y_true`` and ``y_pred``.
    The score of a row is the sum of the element-wise product over that row.

    :param y_true: sparse ground truth, one row per sample.
    :param y_pred: sparse predictions with the same shape as ``y_true``.
    :return: flat 1-D numpy array holding one score per row.
    """
    # Element-wise product; stays sparse when both operands are sparse.
    hits = y_true.multiply(y_pred)
    # The row sum is an (n, 1) ``np.matrix`` for sparse matrices but a 1-D
    # ndarray for sparse arrays. ``np.asarray(...).ravel()`` flattens both,
    # whereas ``.A`` only exists on ``np.matrix``.
    scores = hits.sum(axis=1)
    return np.asarray(scores).ravel()
def squarify(triangular_matrix: csr_matrix) -> csr_matrix:
    """Reflect an upper-triangular matrix across its diagonal.

    The caller must pass an upper-triangular matrix; only squareness is
    checked here, so any other input silently produces a wrong result.

    :param triangular_matrix: square, upper-triangular sparse matrix.
    :return: the matrix plus its transpose, with the diagonal counted once.
    """
    n_rows = triangular_matrix.shape[0]
    n_cols = triangular_matrix.shape[1]
    assert n_rows == n_cols, "needs to be square matrix"

    # Transpose trick, see https://stackoverflow.com/a/58806735/2340703:
    # adding the transpose doubles the diagonal, so subtract it once.
    doubled_diagonal = triangular_matrix + triangular_matrix.T
    diagonal_only = scipy.sparse.diags(triangular_matrix.diagonal())
    return doubled_diagonal - diagonal_only
def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix:
    """Build the cosine similarity matrix of two csr matrices.

    ``master_matrix`` and the transpose of ``duplicate_matrix`` are handed to
    ``awesome_cossim_topn`` together with ``max_n_matches`` and
    ``min_similarity`` from ``self._config``. When the configured
    ``number_of_processes`` is greater than 1, ``use_threads=True`` and
    ``n_jobs`` are passed as well.
    """
    transposed_duplicates = duplicate_matrix.transpose()

    config = self._config
    if config.number_of_processes > 1:
        threading_kwargs = {
            'use_threads': True,
            'n_jobs': config.number_of_processes,
        }
    else:
        threading_kwargs = {}

    return awesome_cossim_topn(
        master_matrix,
        transposed_duplicates,
        config.max_n_matches,
        config.min_similarity,
        **threading_kwargs,
    )
def make_non_zero_information(
        self, weight_csr_matrix: csr_matrix) -> List[ROW_COL_VAL]:
    """Describe every non-zero cell of the matrix as a ROW_COL_VAL tuple.

    :param weight_csr_matrix: ``csr_matrix`` or ``ndarray`` to inspect.
    :return: list with one ROW_COL_VAL (row index, column index, value from
        ``self.__get_value_index``) per non-zero position.
    """
    assert isinstance(weight_csr_matrix, (csr_matrix, ndarray))

    nonzero_positions = weight_csr_matrix.nonzero()
    row_indexes = nonzero_positions[0]
    column_indexes = nonzero_positions[1]
    assert len(row_indexes) == len(column_indexes)

    return [
        ROW_COL_VAL(row, col,
                    self.__get_value_index(row, col, weight_csr_matrix))
        for row, col in zip(row_indexes, column_indexes)
    ]
def make_non_zero_information(weight_csr_matrix: csr_matrix):
    """Describe every non-zero cell of the matrix as a ROW_COL_VAL tuple.

    :param weight_csr_matrix: sparse matrix to inspect.
    :return: list with one ROW_COL_VAL (row index, column index, value from
        ``__get_value_index``) per non-zero position.
    """
    assert isinstance(weight_csr_matrix, csr_matrix)

    nonzero_positions = weight_csr_matrix.nonzero()
    row_indexes = nonzero_positions[0]
    column_indexes = nonzero_positions[1]
    assert len(row_indexes) == len(column_indexes)

    return [
        ROW_COL_VAL(row, col,
                    __get_value_index(row, col, weight_csr_matrix))
        for row, col in zip(row_indexes, column_indexes)
    ]
def initialize_memmap_object(self, matrix_object: csr_matrix, path_memmap_object: str) -> memmap:
    """Create a float64 memmap file holding the dense form of ``matrix_object``.

    :param matrix_object: sparse matrix whose values are written out.
    :param path_memmap_object: file backing the memmap; opened with mode ``w+``.
    :return: memmap shaped like ``matrix_object``.
    """
    memmap_options = dict(dtype='float64', mode='w+', shape=matrix_object.shape)
    target = memmap(path_memmap_object, **memmap_options)
    # Densifies the full matrix in memory, then copies it into the mapping.
    target[:] = matrix_object.todense()
    return target
def matrix(self, matrix: csr_matrix):
    """Store the dense form of ``matrix`` on the instance.

    Nothing is stored when ``matrix.size`` is falsy (for a SciPy sparse
    matrix, when it holds no stored entries); any previous value is kept.
    """
    if not matrix.size:
        return
    self.__matrix = matrix.todense()
def matrix_vector_product(matrix: csr_matrix,
                          vector: np.ndarray) -> np.ndarray:
    """Return the product of a sparse ``matrix`` with a dense ``vector``."""
    product = matrix.dot(vector)
    return product
def transform(self, X: csr_matrix) -> coo_matrix:
    """Multiply ``X`` element-wise by the instance attribute ``self.r``."""
    scaling = self.r
    scaled = X.multiply(scaling)
    return scaled