def test_nmf(self):
        """Dense and sparse NMF must yield the same factors from the same start.

        For every input matrix the initial factors are produced once by
        ``Nmf.v_col_init`` on the dense matrix and converted for the sparse
        run, so both ``Linalg.nmf`` calls start from the same values.  The
        resulting W and H factors are then compared to 2 decimals.
        """
        test_cases = [np.mat([[1, 2, 3], [2, 4, 6], [4, 17, 13]], dtype=np.double),
                      np.mat([[1, 0, 0]], dtype=np.double)]

        for in_mat in test_cases:
            red = Nmf(2)
            d_mat = DenseMatrix(in_mat)
            # red.random_init(d_mat) is the alternative initialisation.
            wd_init, hd_init = red.v_col_init(d_mat)

            s_mat = SparseMatrix(in_mat)
            ws_init = SparseMatrix(wd_init)
            hs_init = SparseMatrix(hd_init)

            wd_mat, hd_mat = Linalg.nmf(d_mat, wd_init, hd_init)
            ws_mat, hs_mat = Linalg.nmf(s_mat, ws_init, hs_init)

            # Original author's note: the results were checked against a
            # MATLAB implementation.
            # Diagnostic output: the input next to its sparse reconstruction.
            # Single-argument print(...) calls emit the same text on Python 2
            # and Python 3; the former print statements were Python 2 only.
            print("V: %s" % (in_mat,))
            print("WH: %s" % ((ws_mat * hs_mat).mat.todense(),))

            np.testing.assert_array_almost_equal(wd_mat.mat,
                                                 ws_mat.mat.todense(), 2)
            np.testing.assert_array_almost_equal(hd_mat.mat,
                                                 hs_mat.mat.todense(), 2)
Esempio n. 2
0
    def test_top_feat_selection(self):
        """TopFeatureSelection returns the expected columns and permutation.

        Each case is (input, expected output, expected column permutation,
        number of columns requested).  Every case is run with the default
        criterion and with ``criterion="length"``, on dense and sparse input.
        Invalid constructor arguments must raise ValueError.
        """
        cases = [
            (self.a, np.mat([[3, 1], [5, 4]]), [2, 0], 2),
            (self.a, np.mat([[3], [5]]), [2], 1),
            (self.a, np.mat([[3, 1, 2], [5, 4, 0]]), [2, 0, 1], 6),
        ]

        def check(selector, data, want_mat, want_perm):
            # Dense input first, then the sparse wrapper of the same data.
            dense_out, dense_perm = selector.apply(DenseMatrix(data))
            np.testing.assert_array_equal(dense_out.mat, want_mat)
            self.assertListEqual(dense_perm, want_perm)

            sparse_out, sparse_perm = selector.apply(SparseMatrix(data))
            np.testing.assert_array_equal(sparse_out.mat.todense(), want_mat)
            self.assertListEqual(sparse_perm, want_perm)

        for data, want_mat, want_perm, n_cols in cases:
            check(TopFeatureSelection(n_cols), data, want_mat, want_perm)
            check(TopFeatureSelection(n_cols, criterion="length"),
                  data, want_mat, want_perm)

        bad_constructions = (
            ((0,), {}),
            ((2,), {"criterion": "something"}),
        )
        for args, kwargs in bad_constructions:
            self.assertRaises(ValueError, TopFeatureSelection, *args, **kwargs)
Esempio n. 3
0
    def setUp(self):
        """Create the numpy arrays and the SparseMatrix fixtures for the tests."""
        # Raw numpy inputs: a/b are 2x3, c is 3x2, d is 2x2, e/f are 1-D.
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        self.b = np.array([[0, 0, 0], [0, 0, 0]])
        self.c = np.array([[0, 0], [0, 0], [0, 0]])
        self.d = np.array([[1, 0], [0, 1]])
        self.e = np.array([1, 10])
        self.f = np.array([1, 10, 100])

        # matrix_a .. matrix_d wrap the arrays of the same letter.
        for letter in ("a", "b", "c", "d"):
            wrapped = SparseMatrix(getattr(self, letter))
            setattr(self, "matrix_" + letter, wrapped)
Esempio n. 4
0
    def test_sparse_lstsq_regression(self):
        """Sparse least-squares regression onto the identity matches the fixture.

        For every (matrix, expected) pair in ``self.pinv_test_cases`` the
        regression result is compared to ``expected`` to 7 decimals.  The
        intercept variant is only exercised, not asserted on.
        """
        for source, expected in self.pinv_test_cases:
            sparse_source = SparseMatrix(source)
            n_rows = sparse_source.shape[0]
            identity = SparseMatrix.identity(n_rows)

            plain_fit = Linalg.lstsq_regression(sparse_source, identity)
            np.testing.assert_array_almost_equal(plain_fit.mat.todense(),
                                                 expected, 7)

            # The two reconstructions below are computed but never compared
            # to anything; they only check that the products can be formed.
            approx1 = (sparse_source * plain_fit).mat.todense()

            intercept_fit = Linalg.lstsq_regression(sparse_source, identity,
                                                    intercept=True)
            ones_column = SparseMatrix(np.ones((n_rows, 1)))
            augmented = sparse_source.hstack(ones_column)

            approx2 = (augmented * intercept_fit).mat.todense()
Esempio n. 5
0
    def test_sparse_lstsq_regression(self):
        """Check sparse least-squares regression against ``self.pinv_test_cases``.

        Regressing each matrix onto an identity of matching row count must
        reproduce the paired expected matrix to 7 decimals.  The regression
        with ``intercept=True`` is run as well, without assertions.
        """
        for raw, wanted in self.pinv_test_cases:
            design = SparseMatrix(raw)
            row_count = design.shape[0]
            eye = SparseMatrix.identity(row_count)

            solution = Linalg.lstsq_regression(design, eye)
            np.testing.assert_array_almost_equal(solution.mat.todense(),
                                                 wanted, 7)

            # Computed for their side effect of exercising the product only;
            # neither approximation is asserted on.
            approx1 = (design * solution).mat.todense()

            solution_with_bias = Linalg.lstsq_regression(design, eye,
                                                         intercept=True)
            bias_column = SparseMatrix(np.ones((row_count, 1)))
            design_with_bias = design.hstack(bias_column)

            approx2 = (design_with_bias * solution_with_bias).mat.todense()
Esempio n. 6
0
def main():
    """
    Command-line entry point: shift a log-weighted EPMI matrix into SPPMI space.

    Reads ``<spacePrefix>.npz`` plus the ``.words.vocab`` and
    ``.contexts.vocab`` files, computes ``log(value) - log(k)`` on the stored
    entries, zeroes everything that is not positive, and pickles the
    resulting Space to ``<outPath> + 'ppmi.sm'``.
    """

    # Get the arguments
    args = docopt(
        '''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter
    
    ''')

    space_prefix = args['<spacePrefix>']
    out_path = args['<outPath>']
    shift = int(args['<k>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    def read_vocab(path):
        # One stripped entry per line of the vocabulary file.
        with open(path) as handle:
            return [entry.strip() for entry in handle if len(entry) > 0]

    # Rebuild the CSR matrix from the arrays stored in the npz archive.
    with np.load(space_prefix + '.npz') as archive:
        csr_parts = (archive['data'], archive['indices'], archive['indptr'])
        matrix = csr_matrix(csr_parts, shape=archive['shape'])

    id2row = read_vocab(space_prefix + '.words.vocab')
    id2column = read_vocab(space_prefix + '.contexts.vocab')

    # Log weighting followed by the shift by log(k).
    matrix.data = np.log(matrix.data)
    matrix.data -= np.log(shift)

    # Clip non-positive values to zero and drop them from the sparse storage.
    not_positive = matrix.data <= 0
    matrix.data[not_positive] = 0.0
    matrix.eliminate_zeros()

    # Wrap the result in a Space and save it as a single pickle file.
    sparse_space = Space(SparseMatrix(matrix), id2row, id2column)
    save_pkl_files(sparse_space, out_path + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 7
0
    def setUp(self):
        """Build a sparse and a dense Space over the same 2x3 matrix."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])

        row_labels = ["a", "b"]
        column_labels = ["f1", "f2", "f3"]

        # Each Space receives its own label lists and its own np.mat wrapper.
        self.space_s = Space(SparseMatrix(np.mat(self.a)),
                             list(row_labels), list(column_labels))
        self.space_d = Space(DenseMatrix(np.mat(self.a)),
                             list(row_labels), list(column_labels))
Esempio n. 8
0
    def test_get_item(self):
        """Indexing a SparseMatrix by row, cell and row slice gives numpy's values."""

        def assert_selection(key, expected):
            # A fresh SparseMatrix is indexed for every check.
            selected = SparseMatrix(self.a)[key]
            np.testing.assert_array_equal(selected.mat.todense(), expected)

        first_row = (0, slice(None))
        assert_selection(first_row, np.mat(self.a[0, :]))

        # A (row, column) pair yields the scalar cell value.
        self.assertEqual(SparseMatrix(self.a)[0, 1], 2)

        assert_selection((0, slice(1, 2)), np.mat(self.a[0, 1:2]))

        # A bare row index behaves like selecting the whole row.
        assert_selection(0, np.mat(self.a[0, :]))
Esempio n. 9
0
def load_pkl_files(dsm_prefix):
    """
    Load a Space from a single pickle file or from its multi-file form.

    If ``<dsm_prefix>.pkl`` exists it is loaded with ``io_utils.load``.
    Otherwise the matrix is read from ``<dsm_prefix>cooc.npz`` and the four
    index mappings from ``<dsm_prefix>_{row2id,id2row,column2id,id2column}.pkl``.

    :param dsm_prefix: path prefix shared by the input files
    :return: the loaded Space
    """
    single_file = dsm_prefix + '.pkl'
    if os.path.isfile(single_file):
        return io_utils.load(single_file)

    # Multi-file form: COO triplets in an npz archive, converted to CSR.
    with np.load(dsm_prefix + 'cooc.npz') as archive:
        triplets = (archive['data'], (archive['row'], archive['col']))
        coo = coo_matrix(triplets, shape=archive['shape'])

    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    members = {}
    for member in ('row2id', 'id2row', 'column2id', 'id2column'):
        with open(dsm_prefix + '_' + member + '.pkl', 'rb') as f_in:
            members[member] = pickle.load(f_in)

    return Space(cooccurrence_matrix,
                 members['id2row'],
                 members['id2column'],
                 row2id=members['row2id'],
                 column2id=members['column2id'])
Esempio n. 10
0
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Keeps the rows of the loaded space whose label either contains no ``_``
    or has the form ``<target>_<ref>``, relabels them with ``<target>`` and
    saves the reduced space in w2v and/or sparse format depending on the
    command-line flags.

    :raises ValueError: if no row label qualifies for the given ``<ref>``
    """

    # Get the arguments
    args = docopt(
        """Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format
        
    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Select rows labelled "<target>" or "<target>_<ref>"; remember the
    # target part and the row index of each selected row.
    targets = []
    indices = []
    for index, label in enumerate(id2row):
        parts = label.split('_')
        if len(parts) == 1 or (len(parts) == 2 and parts[1] == ref):
            targets.append(parts[0])
            indices.append(index)

    # Previously an empty selection failed inside zip(*...) unpacking with
    # an unrelated "not enough values to unpack" ValueError.
    if not targets:
        raise ValueError(
            "No rows selected from '%s': no row label is unsuffixed or "
            "ends in '_%s'" % (spacePrefix, ref))

    new_matrix = matrix[indices, :]

    # Save the Space objects
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=True)
    if is_sps:
        new_space = Space(SparseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 11
0
def to_matrix(matrix_):
    """
    Wrap an array-like structure in a SparseMatrix or a DenseMatrix.

    The wrapper is SparseMatrix when ``issparse(matrix_)`` is true and
    DenseMatrix otherwise.
    """
    wrapper = SparseMatrix if issparse(matrix_) else DenseMatrix
    return wrapper(matrix_)
Esempio n. 12
0
    def test_mul_raises(self):
        """``__mul__`` must raise TypeError for each unsupported operand pair."""
        invalid_pairs = (
            (self.matrix_a, self.a),
            (self.matrix_a, SparseMatrix(self.a)),
            (self.matrix_a, "3"),
            ("3", self.matrix_a),
        )

        for left, right in invalid_pairs:
            with self.assertRaises(TypeError):
                left.__mul__(right)
Esempio n. 13
0
    def test_div_raises(self):
        """``__div__`` must raise the listed error for each bad divisor."""
        failing_divisions = (
            (self.matrix_a, self.a, TypeError),
            (self.matrix_a, SparseMatrix(self.a), TypeError),
            (self.matrix_a, "3", TypeError),
            (self.matrix_a, 0, ZeroDivisionError),
        )

        for dividend, divisor, expected_error in failing_divisions:
            with self.assertRaises(expected_error):
                dividend.__div__(divisor)
Esempio n. 14
0
    def test_multiply_raises(self):
        """``multiply`` must raise the listed error for each bad operand."""
        failing_products = (
            (self.matrix_a, self.matrix_d, ValueError),
            (self.matrix_a, self.a, TypeError),
            (self.matrix_a, SparseMatrix(self.a), TypeError),
        )

        for left, right, expected_error in failing_products:
            with self.assertRaises(expected_error):
                left.multiply(right)
Esempio n. 15
0
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.

    The formats are tried in this order, based on which file exists:

    1. ``<prefix>.pkl``: a single pickled Space;
    2. ``<prefix>.npz`` plus ``<prefix>_{row2id,id2row,column2id,id2column}.pkl``;
    3. ``<prefix>.tsv`` (tab-separated values) plus ``<prefix>.rows`` (targets);
    4. ``<prefix>.w2v``: first line skipped, then ``word v1 v2 ...`` per line.
       This is attempted unconditionally as the last resort.

    :param dsm_prefix: the prefix of the input files
    :return: the loaded Space (without column labels for the tsv/w2v forms)
    """

    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    if os.path.isfile(dsm_prefix + '.npz'):
        with np.load(dsm_prefix + '.npz') as loader:
            coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape'])

        cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        members = {}
        for member in ('row2id', 'id2row', 'column2id', 'id2column'):
            with open(dsm_prefix + '_' + member + '.pkl', 'rb') as f_in:
                members[member] = pickle.load(f_in)

        return Space(cooccurrence_matrix, members['id2row'], members['id2column'],
                     row2id=members['row2id'], column2id=members['column2id'])

    if os.path.isfile(dsm_prefix + '.tsv'):
        # ndmin=2 keeps a single-row or single-column file two-dimensional.
        values = np.loadtxt(dsm_prefix + '.tsv', dtype=float, delimiter='\t', skiprows=0,
                            comments=None, encoding='utf-8', ndmin=2)
        # atleast_1d: a one-line .rows file would otherwise load as a 0-d
        # array, which list() cannot iterate.
        targets = np.atleast_1d(
            np.loadtxt(dsm_prefix + '.rows', dtype=str, skiprows=0, comments=None, encoding='utf-8'))
        # Convert to space in sparse matrix format
        return Space(SparseMatrix(values), list(targets), [])

    # If everything fails try to load it as single w2v file
    # ndmin=2 so that a file holding a single vector can still be sliced by column.
    space_array = np.loadtxt(dsm_prefix + '.w2v', dtype=object, delimiter=' ', skiprows=1,
                             comments=None, encoding='utf-8', ndmin=2)
    targets = space_array[:, 0].flatten()
    # The builtin float replaces the np.float alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; both denote the same dtype.
    values = space_array[:, 1:].astype(float)
    # Convert to space and sparse matrix format
    return Space(SparseMatrix(values), list(targets), [])
Esempio n. 16
0
    def test_plog(self):
        """``plog`` transforms the matrix in place into the expected values.

        The expected matrices hold the natural logarithm of each input entry
        wherever that logarithm is positive, and 0 everywhere else.
        """
        fractional = SparseMatrix(np.mat([[0.5, 1.0, 1.5], [2.0, 0.0, 2.5]]))
        fractional_expected = np.mat([[0., 0., 0.4054], [0.6931, 0., 0.9162]])
        integer_expected = np.mat([[0., 0.6931, 1.0986], [1.3862, 0., 1.6094]])

        cases = (
            # A copy, so the shared fixture is not modified by plog().
            (self.matrix_a.copy(), integer_expected),
            (fractional, fractional_expected),
        )

        for matrix, expected in cases:
            matrix.plog()
            numpy.testing.assert_array_almost_equal(matrix.mat.todense(),
                                                    expected, 3)
Esempio n. 17
0
    def _sparse_svd(matrix_, reduced_dimension):
        """
        Truncated SVD of a sparse matrix, computed with ``sparsesvd``.

        Patch

        Problem: sparsesvd sometimes returns fewer dimensions than requested.
        The patch will no longer be needed once sparsesvd accepts SVDLIBC
        parameters as input (the kappa parameter of SVDLIBC has to be larger
        than the default, e.g. 1E-05 instead of 1E-06).

        Current fix: ask for more dimensions and remove the unnecessary ones.

        :param matrix_: matrix wrapper whose ``mat`` is a scipy sparse matrix
        :param reduced_dimension: number of dimensions to keep
        :return: ``(u, s, v)``: ``u`` and ``v`` are SparseMatrix objects cut to
                 at most ``reduced_dimension`` columns, ``s`` the matching
                 leading singular values
        """
        # Function-scope import: the file's import block is not visible here.
        import logging

        # scipy.sparse.linalg.svds is deliberately not used: per the original
        # notes it raises ValueError when the rank is smaller than
        # reduced_dimension + 1 and does not return sorted values.
        extra_dims = int(reduced_dimension / 10)

        ut, s, vt = sparsesvd(matrix_.mat.tocsc(),
                              reduced_dimension + extra_dims)
        # The raw factors used to be dumped to stdout with Python-2-only
        # print statements on every call; they are now debug log records.
        logging.getLogger(__name__).debug(
            "sparsesvd\nut=%s\ns=%s\nvt=%s", ut, s, vt)

        u = SparseMatrix(ut.transpose())
        v = SparseMatrix(vt.transpose())

        # Drop the surplus dimensions requested above.
        no_cols = min(u.shape[1], reduced_dimension)
        u = u[:, 0:no_cols]
        v = v[:, 0:no_cols]

        Linalg._check_reduced_dim(matrix_.shape[1], u.shape[1],
                                  reduced_dimension)

        # Flip the sign of both factors together when u is not mostly positive.
        if not u.is_mostly_positive():
            u = -u
            v = -v

        return u, s[0:no_cols], v
Esempio n. 18
0
    def test_pinv(self):
        """``Linalg.pinv`` matches the fixtures for dense and sparse input."""
        for source, expected in self.pinv_test_cases:
            dense_result = Linalg.pinv(DenseMatrix(source))
            np.testing.assert_array_almost_equal(dense_result.mat, expected, 7)

            sparse_result = Linalg.pinv(SparseMatrix(source))
            np.testing.assert_array_almost_equal(sparse_result.mat.todense(),
                                                 expected, 7)
Esempio n. 19
0
    def setUp(self):
        """Create the column vectors and the sparse/dense Space fixtures."""

        def column(values):
            # Single-column DenseMatrix from a flat list of values.
            return DenseMatrix(np.mat([[value] for value in values]))

        def space(matrix_cls, rows, labels):
            # All spaces use the same two feature names.
            return Space(matrix_cls(np.mat(rows)), labels, ["f1", "f2"])

        self.m11 = column([3, 9])
        self.m21 = column([4, 2])
        self.ph1 = column([18, 24])

        # Sparse spaces.
        self.space1 = space(SparseMatrix, [[3, 9], [4, 2]], ["a", "b"])
        self.space2 = space(SparseMatrix, [[7, 11]], ["a_b"])
        self.space3 = space(SparseMatrix, [[0, 0]], ["a_b"])

        # Dense counterparts, plus a two-row phrase space.
        self.space4 = space(DenseMatrix, [[3, 9], [4, 2]], ["a", "b"])
        self.space5 = space(DenseMatrix, [[7, 11]], ["a_b"])
        self.space6 = space(DenseMatrix, [[0, 0]], ["a_b"])
        self.space7 = space(DenseMatrix, [[7, 11], [7, 11]], ["a_b", "a_a"])
Esempio n. 20
0
    def test_sparse_svd(self):
        """Sparse ``Linalg.svd`` matches the fixtures up to sign.

        Requesting 2, 3 or 6 dimensions must give the full expected factors;
        requesting 1 must give their first column / first singular value.
        All comparisons use absolute values and 2 decimals.
        """

        def assert_abs_close(actual, expected):
            np.testing.assert_array_almost_equal(np.abs(actual),
                                                 np.abs(expected), 2)

        for x, u_expected, s_expected, v_expected in self.svd_test_cases:
            for dim in [2, 3, 6]:
                u, s, v = Linalg.svd(SparseMatrix(x), dim)
                assert_abs_close(u.mat.todense(), u_expected)
                assert_abs_close(s, s_expected)
                assert_abs_close(v.mat.todense(), v_expected)

            u, s, v = Linalg.svd(SparseMatrix(x), 1)
            assert_abs_close(u.mat.todense(), u_expected[:, 0:1])
            assert_abs_close(s, s_expected[0:1])
            assert_abs_close(v.mat.todense(), v_expected[:, 0:1])
Esempio n. 21
0
    def _sparse_svd(matrix_, reduced_dimension):
        """
        Truncated SVD of a sparse matrix, computed with ``sparsesvd``.

        Patch

        Problem: sparsesvd sometimes returns fewer dimensions than requested.
        The patch will no longer be needed once sparsesvd accepts SVDLIBC
        parameters as input (the kappa parameter of SVDLIBC has to be larger
        than the default, e.g. 1E-05 instead of 1E-06).

        Current fix: ask for more dimensions and remove the unnecessary ones.

        :param matrix_: matrix wrapper whose ``mat`` is a scipy sparse matrix
        :param reduced_dimension: number of dimensions to keep
        :return: ``(u, s, v)``: ``u`` and ``v`` are SparseMatrix objects cut to
                 at most ``reduced_dimension`` columns, ``s`` the matching
                 leading singular values
        """
        # Function-scope import: the file's import block is not visible here.
        import logging

        # scipy.sparse.linalg.svds is deliberately not used: per the original
        # notes it raises ValueError when the rank is smaller than
        # reduced_dimension + 1 and does not return sorted values.
        extra_dims = int(reduced_dimension/10)

        ut, s, vt = sparsesvd(matrix_.mat.tocsc(), reduced_dimension + extra_dims)
        # The raw factors used to be dumped to stdout with Python-2-only
        # print statements on every call; they are now debug log records.
        logging.getLogger(__name__).debug(
            "sparsesvd\nut=%s\ns=%s\nvt=%s", ut, s, vt)

        u = SparseMatrix(ut.transpose())
        v = SparseMatrix(vt.transpose())

        # Drop the surplus dimensions requested above.
        no_cols = min(u.shape[1], reduced_dimension)
        u = u[:, 0:no_cols]
        v = v[:, 0:no_cols]

        Linalg._check_reduced_dim(matrix_.shape[1], u.shape[1], reduced_dimension)

        # Flip the sign of both factors together when u is not mostly positive.
        if not u.is_mostly_positive():
            u = -u
            v = -v

        return u, s[0:no_cols], v
Esempio n. 22
0
    def test_init(self):
        """DenseMatrix accepts arrays, matrices, sparse matrices and SparseMatrix.

        Whatever the input type, ``mat`` must be an ``np.matrix`` holding the
        same values as the source array.
        """
        source = self.a
        accepted_inputs = (
            source,
            np.mat(source),
            csr_matrix(source),
            csc_matrix(source),
            SparseMatrix(source),
        )

        for candidate in accepted_inputs:
            built = DenseMatrix(candidate)
            self.assertIsInstance(built.mat, np.matrix)
            numpy.testing.assert_array_equal(source, np.array(built.mat))
Esempio n. 23
0
def read_sparse_space_data(matrix_file, row2id, column2id, dtype=np.double):
    """
    Read a ``word1 word2 count`` file into a SparseMatrix.

    The file (gzip-compressed if its name ends in ``.gz``) is read twice:
    once to count non-blank lines so the arrays can be pre-sized, and once to
    fill them.  Lines whose words are missing from ``row2id``/``column2id``
    are skipped; fields beyond the third are ignored.

    :param matrix_file: path of the input file
    :param row2id: dictionary, mapping row words to row ids
    :param column2id: dictionary, mapping column words to column ids
    :param dtype: callable/dtype used for the cell values
    :return: SparseMatrix of shape ``(len(row2id), len(column2id))``
    :raises ValueError: if a non-blank line has fewer than 3 fields
    """

    def open_matrix_file():
        opener = gzip.open if matrix_file.endswith(".gz") else open
        return opener(matrix_file, "rb")

    def as_text(raw_line):
        # "rb" yields bytes on Python 3; decode so that the words can match
        # str dictionary keys.  On Python 2 the line already is a str.
        if isinstance(raw_line, str):
            return raw_line
        return raw_line.decode("utf-8")

    # First pass: count the non-blank lines.  The truthiness test works for
    # both bytes and str, unlike a comparison with "".
    with open_matrix_file() as f:
        no_lines = sum(1 for line in f if line.strip())

    row = np.zeros(no_lines, dtype=np.int32)
    col = np.zeros(no_lines, dtype=np.int32)

    data = np.zeros(no_lines, dtype=dtype)

    # Second pass: fill the arrays.  The context manager closes the file
    # even when an invalid row raises.
    i = 0
    with open_matrix_file() as f:
        for raw_line in f:
            line = as_text(raw_line).strip()
            if not line:
                continue

            line_elements = line.split()
            if len(line_elements) < 3:
                raise ValueError(
                    "Invalid row: %s, expected at least %d fields" %
                    (line, 3))

            word1, word2, count = line_elements[0:3]
            if word1 in row2id and word2 in column2id:
                row[i] = row2id[word1]
                col[i] = column2id[word2]
                data[i] = dtype(count)
                i += 1
                if i % 1000000 == 0:
                    print("Progress...%d" % i)

    # eliminate the extra zeros created when word1 or word2 is not row2id or col2id!!
    data = data[0:i]
    row = row[0:i]
    col = col[0:i]

    m = SparseMatrix(
        csr_matrix((data, (row, col)), shape=(len(row2id), len(column2id))))
    if m.mat.nnz != i:
        warn(
            "Found 0-counts or duplicate row,column pairs. (Duplicate entries are summed up.)"
        )

    return m
Esempio n. 24
0
 def setUp(self):
     """Prepare the raw arrays and their SparseMatrix counterparts."""
     raw_arrays = (
         ("a", [[1, 2, 3], [4, 0, 5]]),
         ("b", [[0, 0, 0], [0, 0, 0]]),
         ("c", [[0, 0], [0, 0], [0, 0]]),
         ("d", [[1, 0], [0, 1]]),
         ("e", [1, 10]),
         ("f", [1, 10, 100]),
     )
     for attribute, values in raw_arrays:
         setattr(self, attribute, np.array(values))

     # Only a..d have SparseMatrix wrappers; e and f stay plain vectors.
     self.matrix_a = SparseMatrix(self.a)
     self.matrix_b = SparseMatrix(self.b)
     self.matrix_c = SparseMatrix(self.c)
     self.matrix_d = SparseMatrix(self.d)
Esempio n. 25
0
def read_sparse_space_data_mod(input_list, row2id, column2id, dtype=np.double):
    """
    Transform matrix in tuple structure to DISSECT's sparse matrix format

    Each non-empty entry of ``input_list`` is read as
    ``(word1, word2, count, ...)``; fields beyond the third are ignored and
    entries whose words are missing from the dictionaries are skipped.

    :param input_list: iterable of tuples; empty tuples are ignored
    :param row2id: dictionary, mapping rows to ids
    :param column2id: dictionary, mapping columns to ids
    :param dtype: data type of cell values
    :return: sparse matrix
    :raises ValueError: if a non-empty entry has fewer than 3 fields
    """

    # Materialise the non-empty entries once so that one-shot iterables
    # (e.g. generators) work too and the input is only traversed once.
    entries = [entry for entry in input_list if entry != ()]
    no_lines = len(entries)

    row = np.zeros(no_lines, dtype=np.int32)
    col = np.zeros(no_lines, dtype=np.int32)

    data = np.zeros(no_lines, dtype=dtype)

    i = 0
    for line_elements in entries:
        if len(line_elements) < 3:
            # The entry is a tuple, not a string: format it directly.  The
            # former line.strip() call raised AttributeError here instead of
            # the intended ValueError.
            raise ValueError(
                "Invalid row: %s, expected at least %d fields" %
                (line_elements, 3))

        word1, word2, count = line_elements[0:3]
        if word1 in row2id and word2 in column2id:
            row[i] = row2id[word1]
            col[i] = column2id[word2]
            data[i] = dtype(count)
            i += 1
            if i % 1000000 == 0:
                # Single-argument call: same output on Python 2 and 3.
                print("Progress...%d" % i)

    # eliminate the extra zeros created when word1 or word2 is not row2id or col2id!!
    data = data[0:i]
    row = row[0:i]
    col = col[0:i]

    m = SparseMatrix(
        csr_matrix((data, (row, col)), shape=(len(row2id), len(column2id))))
    if m.mat.nnz != i:
        warn(
            "Found 0-counts or duplicate row,column pairs. (Duplicate entries are summed up.)"
        )

    return m
Esempio n. 26
0
    def test_sparse_ridge_regression(self):
        """Ridge regression must not score worse than least squares.

        The score compared is (residual norm + solution norm), with the
        residual taken against the expected matrix from
        ``self.pinv_test_cases``.
        """
        for raw, expected in self.pinv_test_cases:
            design = SparseMatrix(raw)
            identity = SparseMatrix.identity(design.shape[0])

            lstsq_fit = Linalg.lstsq_regression(design, identity)
            np.testing.assert_array_almost_equal(lstsq_fit.mat.todense(),
                                                 expected, 7)

            # ridge_regression returns a sequence; its first item is the fit.
            ridge_fit = Linalg.ridge_regression(design, identity, 1)[0]

            lstsq_error = (design * lstsq_fit - SparseMatrix(expected)).norm()
            ridge_error = (design * ridge_fit - SparseMatrix(expected)).norm()

            lstsq_score = lstsq_error + lstsq_fit.norm()
            ridge_score = ridge_error + ridge_fit.norm()

            # Original author's note: ridge_error >= lstsq_error should hold
            # as well but is not asserted (possibly a rounding issue).
            self.assertGreaterEqual(lstsq_score, ridge_score)
Esempio n. 27
0
    def single_case_test(self, matrix_, expected, w):
        """
        Apply ``w`` to dense and sparse wrappers of ``matrix_`` and check both.

        Both outputs must equal ``expected`` to 7 decimals, and neither the
        wrappers nor ``matrix_`` itself may have been modified by ``apply``.
        """
        snapshot = matrix_.copy()
        dense_in = DenseMatrix(matrix_)
        sparse_in = SparseMatrix(matrix_)

        dense_out = w.apply(dense_in)
        sparse_out = w.apply(sparse_in)

        for produced in (dense_out.mat, sparse_out.mat.todense()):
            numpy.testing.assert_array_almost_equal(produced, expected, 7)

        # The inputs must still hold the values captured before apply().
        for untouched in (dense_in.mat, matrix_, sparse_in.mat.todense()):
            numpy.testing.assert_array_equal(untouched, snapshot)
Esempio n. 28
0
    def test_sparse_ridge_regression(self):
        """Compare ridge regression with least squares on the pinv fixtures.

        For each fixture the least-squares fit must match the expected matrix
        to 7 decimals, and (residual norm + solution norm) of the ridge fit
        must not exceed that of the least-squares fit.
        """

        def score(design, fit, reference):
            # Residual against the reference plus the norm of the fit itself.
            residual = (design * fit - SparseMatrix(reference)).norm()
            return residual + fit.norm()

        for m, m_inv in self.pinv_test_cases:
            m1 = SparseMatrix(m)
            id_ = SparseMatrix.identity(m1.shape[0])

            unregularised = Linalg.lstsq_regression(m1, id_)
            np.testing.assert_array_almost_equal(unregularised.mat.todense(),
                                                 m_inv, 7)

            regularised = Linalg.ridge_regression(m1, id_, 1)[0]

            norm1 = score(m1, unregularised, m_inv)
            norm2 = score(m1, regularised, m_inv)

            # Original author's note: the ridge residual alone should also be
            # at least the least-squares residual, but that is not asserted
            # (possibly a rounding issue).
            self.assertGreaterEqual(norm1, norm2)
Esempio n. 29
0
    def test_resolve_type_conflict(self):
        """``resolve_type_conflict`` converts every element to the target type.

        Covers mixed dense/sparse wrappers, an empty list, and raw
        numpy/scipy inputs, always with DenseMatrix as the target.
        """
        base = np.mat([1, 2])

        dense = DenseMatrix(base)
        sparse = SparseMatrix(base)

        # Unpacking also checks that the result length matches the input.
        c, d = resolve_type_conflict([dense, sparse], DenseMatrix)
        e, f, g = resolve_type_conflict([sparse, dense, dense], DenseMatrix)
        h = resolve_type_conflict([], DenseMatrix)

        u, v = resolve_type_conflict([base, csr_matrix(base)], DenseMatrix)

        for converted in (c, d, e, f, g):
            self.assertIsInstance(converted, DenseMatrix)
        self.assertListEqual([], h)

        for converted in (u, v):
            self.assertIsInstance(converted, DenseMatrix)
    def run(self):
        """
        Build one summed vector per document and stack them row by row.

        For every token list in ``self.docsTks`` the rows of ``my_space`` for
        those tokens are fetched and summed.  If the lookup of the full list
        fails, it is retried with only the tokens contained in ``key_set``.
        Documents without any vector contribute an all-zero row.  The stacked
        result is stored in ``self.resMatrix`` and the number of documents
        that got a vector in ``self.nsucceed``.

        Relies on the names ``my_space``, ``key_set`` and ``space_dim`` being
        defined outside this method.  Any exception is reported on stdout
        together with a traceback rather than propagated.
        """
        print("Starting " + self.threadName + ". Data size: " + str(len(self.docsTks)))
        success_count = 0
        proccessed_count = 0
        res_max = None
        # Fallback row for documents that yield no vector.
        zero_mtx = SparseMatrix(sparse.csr_matrix(np.array([0] * space_dim)))

        try:
            for tokens in self.docsTks:  # Go through documents
                vecs = None
                proccessed_count += 1
                if tokens == ['']:
                    print(' -> Found an empty line at line ' + str(proccessed_count))
                else:
                    try:
                        vecs = my_space.get_rows(tokens)
                    except Exception:
                        # Retry with only the tokens known to the space.
                        new_tokens = [tok for tok in tokens if tok in key_set]
                        if new_tokens:
                            vecs = my_space.get_rows(new_tokens)

                # Identity test: "== None" would go through a custom __eq__.
                if vecs is None or vecs.get_shape()[0] == 0:
                    print("%s  failed to build vects for line:  %s because vecs=  %s"
                          % (self.threadName, tokens, vecs))
                    row = zero_mtx
                else:
                    # Sum all fetched rows (a single row is its own sum).
                    vsum = vecs[0]
                    for i in range(1, vecs.get_shape()[0]):
                        vsum = vsum + vecs[i]
                    # Every row is wrapped in SparseMatrix; previously the
                    # first single-row result was stored unwrapped, unlike
                    # all the other branches.
                    row = SparseMatrix(vsum)
                    success_count += 1

                # The first document starts the result; later ones are stacked.
                if proccessed_count == 1:
                    res_max = row
                else:
                    res_max = res_max.vstack(row)

                if res_max is None:
                    print("!!!!!!!!!!!!!!!!!!!!!!!!")
                if proccessed_count % 500 == 0:
                    print("\n%s successful processed  %s  in  %s"
                          % (self.threadName, success_count, proccessed_count))

            self.resMatrix = res_max
            print("\n@@@@@@@@@@@@@@@@@@@@ " + " " + str(self.resMatrix.get_shape()))
            self.nsucceed = success_count
        except Exception:
            print("ERR on multi threading technique on " + " " + str(self.threadName))
            traceback.print_exc()
Esempio n. 31
0
 def _sparse_pinv(matrix_):
     """Pseudo-inverse of a sparse matrix, computed on its dense form.

     The matrix is densified, inverted with ``np.linalg.pinv`` and the result
     wrapped in a SparseMatrix again.
     """
     # TODO: implement pinv
     dense_form = matrix_.mat.todense()
     pseudo_inverse = np.linalg.pinv(dense_form)
     return SparseMatrix(pseudo_inverse)
Esempio n. 32
0
    def test_add_raises(self):
        """``__add__`` must raise TypeError for each unsupported operand."""
        invalid_pairs = (
            (self.matrix_a, self.a),
            (self.matrix_a, SparseMatrix(self.a)),
        )

        for left, right in invalid_pairs:
            with self.assertRaises(TypeError):
                left.__add__(right)
Esempio n. 33
0
    def test_svd_raises(self):
        """``Linalg.svd`` must reject a reduced dimension of 0 with ValueError."""
        samples = [np.mat([[1, 2, 3], [2, 4, 6], [4, 675, 43]])]

        for sample in samples:
            for wrapper in (DenseMatrix, SparseMatrix):
                self.assertRaises(ValueError, Linalg.svd, wrapper(sample), 0)
    def test_svd(self):
        """``Svd.apply`` yields the expected U*S and V factors.

        The same 3x3 input is reduced with 2, 3, 6 and 1 requested
        dimensions.  For 2, 3 and 6 the first two expected columns must come
        back; for 1 only the first.  Dense results are compared directly,
        sparse results by absolute value, both to 2 decimals.
        """

        def sample():
            # A fresh np.mat per wrapper, as in the original fixtures.
            return np.mat([[1, 2, 3], [2, 4, 6], [4, 675, 43]])

        us_expected = np.mat([[2.19272110e+00, 3.03174768e+00, 0],
                              [4.38544220e+00, 6.06349536e+00, 0],
                              [6.76369708e+02, -4.91431927e-02, 0]])
        v_expected = np.mat([[0.0059, 0.9979, 0.0636],
                             [0.3255, -0.0621, 0.9434],
                             [0.945, 0.015, -0.325]]).transpose()

        # (requested dimension, number of leading columns expected back)
        plan = [(2, 2), (3, 2), (6, 2), (1, 1)]

        dense_input = DenseMatrix(sample())
        for dim, width in plan:
            us, transmat = Svd(dim).apply(dense_input)
            np.testing.assert_array_almost_equal(
                us.mat, us_expected[:, 0:width], 2)
            np.testing.assert_array_almost_equal(
                transmat.mat, v_expected[:, 0:width], 2)

        sparse_input = SparseMatrix(sample())
        abs_us_expected = np.abs(us_expected)
        abs_v_expected = np.abs(v_expected)
        for dim, width in plan:
            us, transmat = Svd(dim).apply(sparse_input)
            np.testing.assert_array_almost_equal(
                np.abs(us.mat.todense()), abs_us_expected[:, 0:width], 2)
            np.testing.assert_array_almost_equal(
                np.abs(transmat.mat.todense()), abs_v_expected[:, 0:width], 2)
Esempio n. 35
0
class TestSparseMatrix(unittest.TestCase):
    """Unit tests for ``SparseMatrix``.

    The cases cover construction, reshaping, addition, multiplication,
    indexing, row/column scaling and ``plog``; results are compared with
    dense numpy references.
    """

    def setUp(self):
        """Build the raw numpy arrays and the sparse fixtures wrapping them."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        self.b = np.array([[0, 0, 0], [0, 0, 0]])

        self.c = np.array([[0, 0], [0, 0], [0, 0]])
        self.d = np.array([[1, 0], [0, 1]])
        # 1-D scaling vectors: ``e`` has one entry per row of ``a``,
        # ``f`` has one entry per column of ``a``.
        self.e = np.array([1, 10])
        self.f = np.array([1, 10, 100])

        (self.matrix_a, self.matrix_b,
         self.matrix_c, self.matrix_d) = [
            SparseMatrix(raw) for raw in (self.a, self.b, self.c, self.d)
        ]

    def tearDown(self):
        """Nothing to clean up."""

    def test_reshape(self):
        """``reshape`` works in place and agrees with numpy's reshape."""
        plan = [(self.matrix_a, self.a, (1, 6)),
                (self.matrix_a, self.a, (3, 2)),
                (self.matrix_b, self.b, (1, 6)),
                (self.matrix_b, self.b, (6, 1)),
                (self.matrix_b, self.b, (2, 3))]
        # Expected values are all computed before any matrix is reshaped.
        cases = [(matrix, shape, raw.reshape(shape))
                 for matrix, raw, shape in plan]

        for matrix, shape, expected in cases:
            matrix.reshape(shape)
            np.testing.assert_array_equal(matrix.mat.todense(), expected)
            self.assertTupleEqual(shape, matrix.shape)

    def test_reshape_raises(self):
        """Shapes that are incompatible or malformed raise ``ValueError``."""
        bad_shapes = [(3, 0), (3, 3), 3, (3, 3, 3), ("3", "5"), (2, "4")]

        for bad_shape in bad_shapes:
            with self.assertRaises(ValueError):
                self.matrix_a.reshape(bad_shape)

    def test_init(self):
        """Every supported input type ends up stored as a ``csr_matrix``."""
        reference = self.a
        sources = [reference,
                   np.mat(reference),
                   csr_matrix(reference),
                   csc_matrix(reference),
                   DenseMatrix(reference)]

        for source in sources:
            wrapped = SparseMatrix(source)
            self.assertIsInstance(wrapped.mat, csr_matrix)
            np.testing.assert_array_equal(reference,
                                          np.array(wrapped.mat.todense()))

    def test_add(self):
        """Adding two sparse matrices gives the element-wise sum, same type."""
        doubled = np.mat([[2, 4, 6], [8, 0, 10]])
        cases = [(self.matrix_a, self.matrix_a, doubled),
                 (self.matrix_a, self.matrix_b, np.mat(self.a))]

        for left, right, expected in cases:
            total = left + right
            np.testing.assert_array_equal(total.mat.todense(), expected)
            self.assertIsInstance(total, type(left))

    def test_add_raises(self):
        """Adding a raw array or a ``DenseMatrix`` raises ``TypeError``."""
        operands = [self.a, DenseMatrix(self.a)]

        for operand in operands:
            with self.assertRaises(TypeError):
                self.matrix_a.__add__(operand)

    def test_mul(self):
        """``*`` accepts a sparse matrix or a scalar and keeps the type."""
        twice_a = [[2, 4, 6], [8, 0, 10]]
        cases = [(self.matrix_a, self.matrix_c, np.mat([[0, 0], [0, 0]])),
                 (self.matrix_d, self.matrix_a, self.matrix_a.mat.todense()),
                 (self.matrix_a, 2, np.mat(twice_a)),
                 (self.matrix_a, np.int64(2), np.mat(twice_a))]

        for left, right, expected in cases:
            product = left * right
            np.testing.assert_array_equal(product.mat.todense(), expected)
            self.assertIsInstance(product, type(left))

    def test_mul_raises(self):
        """Multiplying by an array, a ``DenseMatrix`` or a string fails."""
        operands = [self.a, DenseMatrix(self.a), "3"]

        for operand in operands:
            with self.assertRaises(TypeError):
                self.matrix_a.__mul__(operand)

    def test_get_item(self):
        """Indexing returns a sub-matrix for slices/rows, a value for a cell."""
        first_row = SparseMatrix(self.a)[0, :]
        np.testing.assert_array_equal(first_row.mat.todense(),
                                      np.mat(self.a[0, :]))

        cell = SparseMatrix(self.a)[0, 1]
        self.assertEqual(cell, 2)

        row_slice = SparseMatrix(self.a)[0, 1:2]
        np.testing.assert_array_equal(row_slice.mat.todense(),
                                      np.mat(self.a[0, 1:2]))

        # A single integer index selects the whole row.
        whole_row = SparseMatrix(self.a)[0]
        np.testing.assert_array_equal(whole_row.mat.todense(),
                                      np.mat(self.a[0, :]))

    def test_scale_rows(self):
        """Row scaling accepts a 1-D array or a column matrix of factors."""
        expected = np.mat([[1, 2, 3], [40, 0, 50]])
        cases = [(self.matrix_a.copy(), factors)
                 for factors in (self.e, np.mat(self.e).T)]

        for matrix, factors in cases:
            scaled = matrix.scale_rows(factors)
            np.testing.assert_array_equal(scaled.mat.todense(), expected)

    def test_scale_columns(self):
        """Column scaling multiplies each column and keeps CSR storage."""
        expected = np.mat([[1, 20, 300], [4, 0, 500]])

        scaled = self.matrix_a.copy().scale_columns(self.f)
        np.testing.assert_array_equal(scaled.mat.todense(), expected)
        self.assertIsInstance(scaled.mat, csr_matrix)

    def test_scale_raises(self):
        """Wrongly sized factors raise ``ValueError``; a string ``TypeError``."""
        by_rows = self.matrix_a.scale_rows
        by_columns = self.matrix_a.scale_columns
        cases = [(by_rows, self.f, ValueError),
                 (by_columns, self.e, ValueError),
                 (by_rows, self.b, ValueError),
                 (by_columns, self.b, ValueError),
                 (by_rows, "3", TypeError)]

        for scale, factors, error_type in cases:
            self.assertRaises(error_type, scale, factors)

    def test_plog(self):
        """``plog`` updates the matrix in place to the expected tables.

        In the expected tables below, the inputs 0, 0.5 and 1.0 all map
        to 0.
        """
        fractional = SparseMatrix(np.mat([[0.5, 1.0, 1.5], [2.0, 0.0, 2.5]]))
        fractional_expected = np.mat([[0., 0., 0.4054], [0.6931, 0., 0.9162]])
        a_expected = np.mat([[0., 0.6931, 1.0986], [1.3862, 0., 1.6094]])
        cases = [(self.matrix_a.copy(), a_expected),
                 (fractional, fractional_expected)]

        for matrix, expected in cases:
            matrix.plog()
            np.testing.assert_array_almost_equal(matrix.mat.todense(),
                                                 expected, 3)
Esempio n. 36
0
def main():
    """
    Align two pickled spaces on the columns they have in common.

    The command line is parsed with docopt (usage text below). Both spaces
    are loaded, cut down to their shared columns using one common column
    order, optionally L2-normalized row by row (``--len``), wrapped in new
    ``Space`` objects and handed to ``save_pkl_files`` under
    ``<outPath1>.sm`` and ``<outPath2>.sm``.
    """

    # Parse the command line
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len   normalize final vectors to unit length
    
    ''')

    def unit_rows(matrix):
        # Divide every row by its L2 norm; all-zero rows get a divisor of 1
        # so they are left unchanged instead of dividing by zero.
        norms = linalg.norm(matrix, axis=1, ord=2)
        norms[norms == 0.0] = 1.0
        matrix /= norms.reshape(len(norms), 1)
        return matrix

    normalize = args['--len']
    prefix_a, prefix_b = args['<spacePrefix1>'], args['<spacePrefix2>']
    out_a, out_b = args['<outPath1>'], args['<outPath2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    started = time.time()

    space_a = load_pkl_files(prefix_a)
    space_b = load_pkl_files(prefix_b)

    rows_a, rows_b = space_a.get_id2row(), space_b.get_id2row()
    columns_a, columns_b = space_a.get_id2column(), space_b.get_id2column()
    index_a, index_b = space_a.get_column2id(), space_b.get_column2id()

    # One shared column list drives the selection in both spaces, so the
    # two reduced matrices end up with identically ordered columns.
    shared = list(set(columns_a).intersection(columns_b))
    picked_a = [index_a[column] for column in shared]
    picked_b = [index_b[column] for column in shared]

    matrix_a = space_a.get_cooccurrence_matrix()[:, picked_a].get_mat()
    matrix_b = space_b.get_cooccurrence_matrix()[:, picked_b].get_mat()

    if normalize:
        # L2-normalize vectors
        matrix_a = unit_rows(matrix_a)
        matrix_b = unit_rows(matrix_b)

    aligned_a = Space(SparseMatrix(matrix_a), rows_a, shared)
    aligned_b = Space(SparseMatrix(matrix_b), rows_b, shared)

    # Sanity check: both results must report the same column list.
    if aligned_a.get_id2column() != aligned_b.get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space objects in pickle format
    save_pkl_files(aligned_a, out_a + '.sm', save_in_one_file=True)
    save_pkl_files(aligned_b, out_b + '.sm', save_in_one_file=True)

    elapsed = time.time() - started
    logging.info("--- %s seconds ---" % elapsed)