def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
    """Return the coordinates and values of all nonzero matches.

    Args:
        matches: Sparse matrix of similarity values.

    Returns:
        DataFrame with one row per nonzero entry of ``matches``: the row
        index in ``master_side`` and the column index in ``dupe_side``
        (both int64), and the stored value in ``similarity``.
    """
    # ``nonzero()`` skips explicitly stored zeros while ``matches.data`` keeps
    # them, so pairing the two can produce columns of different lengths and
    # make the DataFrame constructor raise. Take indices and values from the
    # same COO view and filter all three with one mask so they stay aligned.
    coo = matches.tocoo()
    keep = coo.data != 0
    return pd.DataFrame({'master_side': coo.row[keep].astype(np.int64),
                         'dupe_side': coo.col[keep].astype(np.int64),
                         'similarity': coo.data[keep]})
Ejemplo n.º 2
0
def compute_information_gain(vectorizer: CountVectorizer, word: str, dataTrain: csr_matrix, targetTrain: [int]) \
        -> float:
    """Return the information gain from splitting the training rows on a word.

    Each row of ``dataTrain`` is assigned to one of two groups depending on
    whether the lower-cased ``word`` is among the terms that
    ``vectorizer.inverse_transform`` yields for that row. The result is the
    entropy of ``targetTrain`` minus the size-weighted entropies of the two
    groups.

    :param vectorizer: vectorizer used to map each row back to its terms
    :param word: word to split on (lower-cased before comparison)
    :param dataTrain: sparse matrix with one row per sample
    :param targetTrain: labels aligned with the rows; each must be 0 or 1
    :return: information gain of the split
    """
    word = word.lower()
    parentEntropy = computeEntropy(targetTrain)
    numRows = dataTrain.get_shape()[0]

    # Per-label sample counts for the rows with / without the word.
    countsWithWord = {0: 0, 1: 0}
    countsWithoutWord = {0: 0, 1: 0}
    for rowIndex in range(numRows):
        rowTerms = vectorizer.inverse_transform(dataTrain[rowIndex])[0]
        bucket = countsWithWord if word in rowTerms else countsWithoutWord
        bucket[targetTrain[rowIndex]] += 1

    # Expand the counts back into label lists so each group can be passed
    # to computeEntropy the same way targetTrain is.
    labelsWithWord = [0] * countsWithWord[0] + [1] * countsWithWord[1]
    labelsWithoutWord = [0] * countsWithoutWord[0] + [1] * countsWithoutWord[1]
    entropyWithWord = computeEntropy(labelsWithWord)
    entropyWithoutWord = computeEntropy(labelsWithoutWord)

    shareWithWord = len(labelsWithWord) / numRows
    shareWithoutWord = len(labelsWithoutWord) / numRows
    weightedChildEntropy = (entropyWithWord * shareWithWord +
                            entropyWithoutWord * shareWithoutWord)
    return parentEntropy - weightedChildEntropy
Ejemplo n.º 3
0
 def initialize_memmap_object(self, matrix_object: csr_matrix,
                              path_memmap_object: str) -> memmap:
     """Write the dense form of a sparse matrix into a new memory-mapped file.

     :param matrix_object: sparse matrix whose values are stored
     :param path_memmap_object: path of the file backing the memmap; it is
         opened with mode ``'w+'``
     :return: float64 memmap with the same shape as ``matrix_object``
     """
     target_shape = matrix_object.shape
     mapped = memmap(path_memmap_object, dtype='float64', mode='w+',
                     shape=target_shape)
     # Densify only after the backing file exists, then copy every cell.
     dense_values = matrix_object.todense()
     mapped[:] = dense_values
     return mapped
Ejemplo n.º 4
0
 def _symmetrize_matrix_and_fix_diagonal(
         x_non_symmetric: csr_matrix) -> csr_matrix:
     """Mirror every nonzero entry across the diagonal and set the diagonal to 1.

     The value at each nonzero position (i, j) is written to (j, i), then
     all diagonal entries are overwritten with 1.

     NOTE(review): if both (i, j) and (j, i) are already nonzero, the two
     values are exchanged rather than made equal -- presumably callers pass
     a triangular matrix; confirm.

     :param x_non_symmetric: square sparse matrix to symmetrize
     :return: new CSR matrix; the input is not modified
     """
     # Work on a LIL copy, which supports item assignment.
     lil = x_non_symmetric.tolil()
     rows, cols = lil.nonzero()
     # Read all source values before writing any mirrored position.
     source_values = lil[rows, cols]
     lil[cols, rows] = source_values
     diagonal = np.arange(lil.shape[0])
     lil[diagonal, diagonal] = 1
     return lil.tocsr()
Ejemplo n.º 5
0
def calc_score(y_true: csr_matrix, y_pred: csr_matrix):
    """Return one score per row: the row sum of the elementwise product.

    :param y_true: sparse matrix of true entries
    :param y_pred: sparse matrix of predicted entries, same shape as y_true
    :return: flat numpy array with one score per row
    """
    # The elementwise product is nonzero only where both matrices have an
    # entry, i.e. where a prediction matches something that appeared.
    overlap = y_true.multiply(y_pred)
    # Summing along axis 1 gives an n x 1 matrix; flatten it to length n.
    per_row_totals = overlap.sum(axis=1)
    return per_row_totals.A.flatten()
Ejemplo n.º 6
0
    def squarify(triangular_matrix: csr_matrix) -> csr_matrix:
        """Build a symmetric matrix by mirroring an upper-triangular one.

        The input is *not* checked for being upper triangular; for any other
        input the result is incorrect.

        :param triangular_matrix: square, upper-triangular sparse matrix
        :return: the matrix plus its transpose, with the diagonal counted once
        :raises AssertionError: if the matrix is not square
        """
        shape = triangular_matrix.shape
        assert shape[0] == shape[1], "needs to be square matrix"
        # Transpose method, see https://stackoverflow.com/a/58806735/2340703.
        # A + A.T counts the diagonal twice, so subtract it once.
        mirrored_sum = triangular_matrix + triangular_matrix.T
        diagonal_once = scipy.sparse.diags(triangular_matrix.diagonal())
        return mirrored_sum - diagonal_once
Ejemplo n.º 7
0
    def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix:
        """Build the cosine similarity matrix of two csr matrices.

        :param master_matrix: left operand, passed through unchanged
        :param duplicate_matrix: right operand, transposed before the call
        :return: result of ``awesome_cossim_topn`` using the configured
            ``max_n_matches`` and ``min_similarity``
        """
        transposed_duplicates = duplicate_matrix.transpose()

        # Threading options are only passed when more than one process is
        # configured; otherwise the callee's defaults apply.
        threading_kwargs = {}
        if self._config.number_of_processes > 1:
            threading_kwargs['use_threads'] = True
            threading_kwargs['n_jobs'] = self._config.number_of_processes

        return awesome_cossim_topn(
            master_matrix,
            transposed_duplicates,
            self._config.max_n_matches,
            self._config.min_similarity,
            **threading_kwargs
        )
Ejemplo n.º 8
0
    def make_non_zero_information(
            self, weight_csr_matrix: csr_matrix) -> List[ROW_COL_VAL]:
        """Collect every nonzero position of the matrix as a ROW_COL_VAL.

        :param weight_csr_matrix: matrix to scan; must be a csr_matrix or
            an ndarray
        :return: list with one ROW_COL_VAL per nonzero position, built from
            the row index, the column index and the result of
            ``self.__get_value_index`` for that position
        """
        assert isinstance(weight_csr_matrix, (csr_matrix, ndarray))

        nonzero_positions = weight_csr_matrix.nonzero()
        row_indexes, column_indexes = nonzero_positions[0], nonzero_positions[1]
        assert len(row_indexes) == len(column_indexes)

        return [
            ROW_COL_VAL(row, col,
                        self.__get_value_index(row, col, weight_csr_matrix))
            for row, col in zip(row_indexes, column_indexes)
        ]
def make_non_zero_information(weight_csr_matrix:csr_matrix):
    """Collect every nonzero position of the matrix as a ROW_COL_VAL.

    :param weight_csr_matrix: matrix to scan; must be a csr_matrix
    :return: list with one ROW_COL_VAL per nonzero position, built from the
        row index, the column index and the result of ``__get_value_index``
        for that position
    """
    assert isinstance(weight_csr_matrix, csr_matrix)

    nonzero_positions = weight_csr_matrix.nonzero()
    row_indexes, column_indexes = nonzero_positions[0], nonzero_positions[1]
    assert len(row_indexes) == len(column_indexes)

    return [
        ROW_COL_VAL(row, col, __get_value_index(row, col, weight_csr_matrix))
        for row, col in zip(row_indexes, column_indexes)
    ]
 def initialize_memmap_object(self, matrix_object: csr_matrix, path_memmap_object: str) -> memmap:
     """Write the dense form of a sparse matrix into a new memory-mapped file.

     :param matrix_object: sparse matrix whose values are stored
     :param path_memmap_object: path of the file backing the memmap; it is
         opened with mode ``'w+'``
     :return: float64 memmap with the same shape as ``matrix_object``
     """
     target_shape = matrix_object.shape
     mapped = memmap(path_memmap_object, dtype='float64', mode='w+',
                     shape=target_shape)
     # Densify only after the backing file exists, then copy every cell.
     dense_values = matrix_object.todense()
     mapped[:] = dense_values
     return mapped
Ejemplo n.º 11
0
 def matrix(self, matrix: csr_matrix):
     """Store the dense form of ``matrix`` unless it has no stored entries.

     :param matrix: sparse matrix to store; when ``matrix.size`` is zero
         the current value is left untouched
     """
     if not matrix.size:
         return
     self.__matrix = matrix.todense()
def matrix_vector_product(matrix: csr_matrix,
                          vector: np.ndarray) -> np.ndarray:
    """Return the product of a sparse matrix and a dense vector.

    :param matrix: sparse left operand
    :param vector: dense right operand
    :return: result of ``matrix.dot(vector)``
    """
    product = matrix.dot(vector)
    return product
Ejemplo n.º 13
0
 def transform(self, X: csr_matrix) -> coo_matrix:
     """Scale ``X`` elementwise by the factor stored in ``self.r``.

     :param X: sparse matrix to scale
     :return: result of ``X.multiply(self.r)``
     """
     scale = self.r
     return X.multiply(scale)