Example #1
0
    def apply(self, matrix_, column_marginal=None):
        """
        Applies epmi weighting to a matrix.

        Every row i is scaled by nonzero_invert(row_sum[i]) and every
        column j by nonzero_invert(col_sum[j]) * total, where total is the
        sum of the column marginals in use.

        Args:
            matrix_ (Matrix): Input matrix; assert_positive() is called on
                it before any weighting takes place

            column_marginal (np.ndarray): column marginals of the
                core matrix if the matrix is a peripheral matrix. When
                left as None, the column sums of matrix_ are used instead

        Returns:
            Matrix: the matrix after applying epmi.

        """

        matrix_.assert_positive()
        row_totals = matrix_.sum(axis=1)

        if column_marginal is None:
            col_totals = matrix_.sum(axis=0)
        else:
            col_totals = column_marginal

        # The grand total has to be taken before the marginals are inverted.
        grand_total = col_totals.sum()

        row_scale = nonzero_invert(row_totals)
        col_scale = nonzero_invert(col_totals) * grand_total

        return matrix_.scale_rows(row_scale).scale_columns(col_scale)
    def apply(self, matrix_, column_marginal=None):
        """
        Applies epmi weighting to a matrix.

        Rows are scaled by nonzero_invert of the row sums; columns are
        scaled by nonzero_invert of the column marginals multiplied by the
        sum of those marginals.

        Args:
            matrix_ (Matrix): Input matrix; it is checked with
                assert_positive() first

            column_marginal (np.ndarray): column marginals of the
                core matrix if the matrix is a peripheral matrix. If None,
                the column sums of matrix_ itself serve as marginals

        Returns:
            Matrix: the matrix after applying epmi.

        """

        matrix_.assert_positive()
        per_row = matrix_.sum(axis=1)

        use_own_columns = column_marginal is None
        per_col = matrix_.sum(axis=0) if use_own_columns else column_marginal

        # Read the overall total while the marginals are still un-inverted.
        overall = per_col.sum()

        inv_row = nonzero_invert(per_row)
        inv_col = nonzero_invert(per_col)
        inv_col = inv_col * overall

        row_scaled = matrix_.scale_rows(inv_row)
        return row_scaled.scale_columns(inv_col)
Example #3
0
    def _sims_to_matrix(self, vector, matrix_):
        """
        Rescales the dot-product similarities between vector and matrix_.

        The similarities returned by DotProdSimilarity._sims_to_matrix are
        scaled row-wise by nonzero_invert(vector.norm() * matrix_.norm(1)).
        NOTE(review): this presumably yields cosine similarity, assuming
        nonzero_invert takes the reciprocal of non-zero entries - confirm.

        Args:
            vector: object exposing norm(), passed on to DotProdSimilarity
            matrix_: object exposing norm(axis), passed on to
                DotProdSimilarity

        Returns:
            The result of scale_rows() on the dot-product similarities.

        """
        dot_products = DotProdSimilarity()._sims_to_matrix(vector, matrix_)

        norm_products = vector.norm() * matrix_.norm(1)
        inverse_norms = nonzero_invert(norm_products)

        return dot_products.scale_rows(inverse_norms)
Example #4
0
    def apply(self, matrix_):
        """
        Normalizes the rows of a matrix.

        Each row is scaled by nonzero_invert of its norm when
        self.criterion equals "length"; for any other criterion value the
        row sum is used instead.

        Args:
            matrix_: Input matrix exposing norm(), sum() and scale_rows()

        Returns:
            The matrix returned by scale_rows().

        """
        by_length = self.criterion == "length"
        row_totals = matrix_.norm(axis=1) if by_length else matrix_.sum(axis=1)

        return matrix_.scale_rows(nonzero_invert(row_totals))
    def apply(self, matrix_):
        """
        Normalizes the rows of a matrix.

        With self.criterion == "length" the per-row norm is the scaling
        basis; with any other value the per-row sum is. The basis is passed
        through nonzero_invert and applied with scale_rows().

        Args:
            matrix_: Input matrix exposing norm(), sum() and scale_rows()

        Returns:
            The matrix returned by scale_rows().

        """
        if self.criterion == "length":
            basis = matrix_.norm(axis=1)
        else:
            basis = matrix_.sum(axis=1)

        scale = nonzero_invert(basis)
        return matrix_.scale_rows(scale)
Example #6
0
def main():
    """
    Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix.

    Command-line arguments are parsed with docopt from the usage string
    below: an input prefix, the shift k, the smoothing exponent alpha, an
    output path and the optional -l/--len flag. The space is loaded with
    load_pkl_files, weighted, optionally L2-normalized row-wise, wrapped in
    a new Space and written with save_pkl_files to outPath + ".ppmi.sm".

    Smoothing is performed as described in

      Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3.

    """

    # Get the arguments
    args = docopt(
        '''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format.

    Usage:
        ppmi.py [-l] <dsm_prefix> <k> <alpha> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
        <k> = shifting parameter
        <alpha> = smoothing parameter
        <outPath> = output path for space

    Options:
        -l, --len   normalize final vectors to unit length

    ''')

    is_len = args['--len']
    dsm_prefix = args['<dsm_prefix>']
    # NOTE(review): k is only used as np.log(k) below, so it needs to be a
    # positive integer; no validation is done here.
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])
    # NOTE(review): the usage text mentions ".ppmi" output files, but the
    # path actually written below is outPath + ".ppmi.sm".
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)
    id2row = dsm.get_id2row()
    id2column = dsm.get_id2column()

    # Get the co-occurrence matrix and its row/column sums
    matrix_ = dsm.cooccurrence_matrix

    matrix_.assert_positive()
    row_sum = matrix_.sum(axis=1)
    col_sum = matrix_.sum(axis=0)

    # Compute smoothed P_alpha(c): column sums raised to alpha, then
    # normalized so that they sum to one
    smooth_col_sum = np.power(col_sum, alpha)
    col_sum = smooth_col_sum / smooth_col_sum.sum()

    # Prepare the scaling factors: nonzero_invert of the raw row sums and of
    # the smoothed column probabilities
    row_sum = nonzero_invert(row_sum)
    col_sum = nonzero_invert(col_sum)

    # Apply epmi weighting (without log)
    matrix_ = matrix_.scale_rows(row_sum)
    matrix_ = matrix_.scale_columns(col_sum)

    # Apply log weighting (only to the stored entries in .mat.data)
    matrix_.mat.data = np.log(matrix_.mat.data)

    # Shift values by log(k)
    matrix_.mat.data -= np.log(k)

    # Set non-positive shifted values to zero, keeping only positive PMI
    matrix_.mat.data[matrix_.mat.data <= 0] = 0.0

    # Drop the explicitly stored zeros from the matrix
    matrix_.mat.eliminate_zeros()

    # Unwrap the underlying matrix object
    # (presumably a scipy sparse matrix - TODO confirm)
    matrix_ = matrix_.get_mat()

    if is_len:
        # L2-normalize vectors (row-wise)
        l2norm1 = linalg.norm(matrix_, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 norms to 1 so all-zero rows are not divided by zero
        # Reshape the norms into a column so each row is divided by its own norm
        matrix_ /= l2norm1.reshape(len(l2norm1), 1)

    # Re-wrap the weighted matrix with the original row/column labels
    dsm = Space(SparseMatrix(matrix_), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(dsm, outPath + ".ppmi.sm", save_in_one_file=False)
    logging.info("--- %s seconds ---" % (time.time() - start_time))