Esempio n. 1
0
    def test_top_feat_selection(self):
        """Check TopFeatureSelection on dense and sparse input.

        Each case is (input, expected matrix, expected column permutation,
        number of columns requested). The same expectations are used for the
        default criterion and for ``criterion="length"``.
        """
        cases = [
            (self.a, np.mat([[3, 1], [5, 4]]), [2, 0], 2),
            (self.a, np.mat([[3], [5]]), [2], 1),
            (self.a, np.mat([[3, 1, 2], [5, 4, 0]]), [2, 0, 1], 6),
        ]

        for source, want_mat, want_perm, n_cols in cases:
            # First the default criterion, then the explicit "length" one.
            for extra_args in ({}, {"criterion": "length"}):
                selector = TopFeatureSelection(n_cols, **extra_args)

                dense_out, dense_perm = selector.apply(DenseMatrix(source))
                np.testing.assert_array_equal(dense_out.mat, want_mat)
                self.assertListEqual(dense_perm, want_perm)

                sparse_out, sparse_perm = selector.apply(SparseMatrix(source))
                np.testing.assert_array_equal(sparse_out.mat.todense(),
                                              want_mat)
                self.assertListEqual(sparse_perm, want_perm)

        # Invalid constructor arguments are expected to raise ValueError.
        self.assertRaises(ValueError, TopFeatureSelection, 0)
        self.assertRaises(ValueError, TopFeatureSelection, 2,
                          criterion="something")
    def test_nmf(self):
        """Check that dense and sparse NMF give the same factors.

        Both runs start from the same initial factors (obtained from
        ``v_col_init`` on the dense matrix and converted to sparse for the
        sparse run); the resulting W and H must agree to 2 decimals.
        """
        test_cases = [np.mat([[1, 2, 3], [2, 4, 6], [4, 17, 13]],
                             dtype=np.double),
                      np.mat([[1, 0, 0]], dtype=np.double)]

        for in_mat in test_cases:
            red = Nmf(2)
            d_mat = DenseMatrix(in_mat)
            # red.random_init(d_mat) is the alternative initialisation.
            wd_init, hd_init = red.v_col_init(d_mat)

            s_mat = SparseMatrix(in_mat)
            ws_init = SparseMatrix(wd_init)
            hs_init = SparseMatrix(hd_init)

            wd_mat, hd_mat = Linalg.nmf(d_mat, wd_init, hd_init)
            ws_mat, hs_mat = Linalg.nmf(s_mat, ws_init, hs_init)

            # Original author's note: results were checked against a MATLAB
            # implementation.
            # %-formatting with a 1-tuple prints the same text under both
            # Python 2 and Python 3 (the former print statements were
            # Python-2-only syntax).
            print("V: %s" % (in_mat,))
            print("WH: %s" % ((ws_mat * hs_mat).mat.todense(),))

            np.testing.assert_array_almost_equal(wd_mat.mat,
                                                 ws_mat.mat.todense(), 2)
            np.testing.assert_array_almost_equal(hd_mat.mat,
                                                 hs_mat.mat.todense(), 2)
Esempio n. 3
0
    def setUp(self):
        """Create the raw arrays and the SparseMatrix fixtures built on them."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        self.b = np.array([[0, 0, 0], [0, 0, 0]])
        self.c = np.array([[0, 0], [0, 0], [0, 0]])
        self.d = np.array([[1, 0], [0, 1]])
        self.e = np.array([1, 10])
        self.f = np.array([1, 10, 100])

        # matrix_a .. matrix_d wrap the arrays a .. d, in that order.
        for letter in "abcd":
            setattr(self, "matrix_" + letter,
                    SparseMatrix(getattr(self, letter)))
Esempio n. 4
0
    def test_get_item(self):
        """Exercise row, scalar and slice indexing on a SparseMatrix."""
        first_row = np.mat(self.a[0, :])

        row_by_slice = SparseMatrix(self.a)[0, :]
        np.testing.assert_array_equal(row_by_slice.mat.todense(), first_row)

        cell = SparseMatrix(self.a)[0, 1]
        self.assertEqual(cell, 2)

        partial_row = SparseMatrix(self.a)[0, 1:2]
        np.testing.assert_array_equal(partial_row.mat.todense(),
                                      np.mat(self.a[0, 1:2]))

        row_by_int = SparseMatrix(self.a)[0]
        np.testing.assert_array_equal(row_by_int.mat.todense(), first_row)
Esempio n. 5
0
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Parses the command line with docopt, loads the space stored under
    <spacePrefix>, keeps the rows selected for <ref> (see below) and saves
    the resulting space to <outPath> in the requested format.
    """

    # Get the arguments
    args = docopt(
        """Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format
        
    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Split every row label on '_'. Labels without '_' are kept as they are;
    # labels of the form <word>_<ref> are kept under the bare <word>. Labels
    # with a different suffix, or with more than one '_', are dropped.
    ti = [(spl[0], i) for i, w in enumerate(id2row) for spl in [w.split('_')]
          if len(spl) == 1 or (len(spl) == 2 and spl[1] == ref)]
    # Unzip into parallel (targets, indices) tuples. If no row matched, the
    # unpacking below raises ValueError.
    targets, indices = zip(*ti)

    # Keep only the selected rows, in their original order.
    new_matrix = matrix[list(indices), :]

    # Save the Space objects
    # The usage pattern "(-w | -s)" requires exactly one of the two flags.
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=True)
    if is_sps:
        new_space = Space(SparseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 6
0
    def test_sparse_lstsq_regression(self):
        """Check Linalg.lstsq_regression on sparse input.

        Regressing each matrix from ``self.pinv_test_cases`` onto the identity
        is expected to reproduce the inverse stored alongside it (7 decimals).
        """

        test_cases = self.pinv_test_cases
        for m, m_inv in test_cases:
            m1 = SparseMatrix(m)
            id_ = SparseMatrix.identity(m1.shape[0])

            res = Linalg.lstsq_regression(m1, id_)
            np.testing.assert_array_almost_equal(res.mat.todense(), m_inv, 7)

            # NOTE(review): approx1 is computed but not asserted on in the
            # visible code.
            approx1 = (m1 * res).mat.todense()

            # Same regression with an intercept; the input is extended with a
            # column of ones so it can be multiplied with the result.
            res2 = Linalg.lstsq_regression(m1, id_, intercept=True)
            new_a = m1.hstack(SparseMatrix(np.ones((m1.shape[0], 1))))

            # NOTE(review): approx2 is likewise never checked here.
            approx2 = (new_a * res2).mat.todense()
Esempio n. 7
0
def main():
    """
    Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    The stored values are log-transformed, shifted by log(k), and every
    value that is not strictly positive is removed from the sparse matrix.
    """

    # Get the arguments
    args = docopt(
        '''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter
    
    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get npz matrix
    # The archive holds the three CSR component arrays plus the shape.
    with np.load(spacePrefix + '.npz') as loader:
        matrix = csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])

    # NOTE(review): lines yielded by file iteration keep their newline, so
    # `len(line) > 0` is always true and blank lines become '' entries.
    # `vocab` is not used again in this function.
    with open(spacePrefix + '.words.vocab') as f:
        id2row = vocab = [line.strip() for line in f if len(line) > 0]

    with open(spacePrefix + '.contexts.vocab') as f:
        id2column = [line.strip() for line in f if len(line) > 0]

    # Apply log weighting
    # Only the explicitly stored entries (matrix.data) are transformed.
    matrix.data = np.log(matrix.data)

    # Shift values
    matrix.data -= np.log(k)

    # Eliminate negative counts
    matrix.data[matrix.data <= 0] = 0.0

    # Eliminate zero counts
    matrix.eliminate_zeros()

    # Create new space
    sparseSpace = Space(SparseMatrix(matrix), id2row, id2column)

    #print sparseSpace.get_cooccurrence_matrix()

    # Save the Space object in pickle format
    # 'ppmi.sm' is appended directly to outPath, with no separator.
    save_pkl_files(sparseSpace, outPath + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 8
0
    def setUp(self):
        """Build a sparse and a dense Space over the same 2x3 array."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        row_names = ("a", "b")
        column_names = ("f1", "f2", "f3")

        # Each Space receives its own fresh label lists.
        self.space_s = Space(SparseMatrix(np.mat(self.a)),
                             list(row_names), list(column_names))
        self.space_d = Space(DenseMatrix(np.mat(self.a)),
                             list(row_names), list(column_names))
Esempio n. 9
0
def load_pkl_files(dsm_prefix):
    """
    Load a space from a single pickle file or from its component files.

    :param dsm_prefix: common path prefix of the stored files
    :return: the object loaded from ``<prefix>.pkl`` when that file exists,
        otherwise a Space assembled from the matrix and index files
    """
    single_file = dsm_prefix + '.pkl'
    if os.path.isfile(single_file):
        return io_utils.load(single_file)

    def _unpickle(suffix):
        # Read one pickled data member stored next to the matrix.
        with open(dsm_prefix + suffix, 'rb') as handle:
            return pickle.load(handle)

    # The matrix is stored as COO triplets in an npz archive.
    with np.load(dsm_prefix + 'cooc.npz') as archive:
        coordinates = (archive['row'], archive['col'])
        coo = coo_matrix((archive['data'], coordinates),
                         shape=archive['shape'])

    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    row2id = _unpickle('_row2id.pkl')
    id2row = _unpickle('_id2row.pkl')
    column2id = _unpickle('_column2id.pkl')
    id2column = _unpickle('_id2column.pkl')

    return Space(cooccurrence_matrix, id2row, id2column,
                 row2id=row2id, column2id=column2id)
Esempio n. 10
0
    def test_mul_raises(self):
        """``__mul__`` must raise TypeError for each operand pair below."""
        operand_pairs = [
            (self.matrix_a, self.a),
            (self.matrix_a, SparseMatrix(self.a)),
            (self.matrix_a, "3"),
            ("3", self.matrix_a),
        ]

        for left, right in operand_pairs:
            multiply = left.__mul__
            with self.assertRaises(TypeError):
                multiply(right)
Esempio n. 11
0
def to_matrix(matrix_):
    """
    Wrap an array-like structure in a SparseMatrix when ``issparse`` reports
    it as sparse, and in a DenseMatrix otherwise.
    """
    wrapper = SparseMatrix if issparse(matrix_) else DenseMatrix
    return wrapper(matrix_)
Esempio n. 12
0
    def test_multiply_raises(self):
        """``multiply`` must raise the listed error for each operand pair."""
        cases = [
            (self.matrix_a, self.matrix_d, ValueError),
            (self.matrix_a, self.a, TypeError),
            (self.matrix_a, SparseMatrix(self.a), TypeError),
        ]

        for left, right, expected_error in cases:
            multiply = left.multiply
            with self.assertRaises(expected_error):
                multiply(right)
Esempio n. 13
0
    def test_div_raises(self):
        """``__div__`` must raise the listed error for each divisor."""
        cases = [
            (self.matrix_a, self.a, TypeError),
            (self.matrix_a, SparseMatrix(self.a), TypeError),
            (self.matrix_a, "3", TypeError),
            (self.matrix_a, 0, ZeroDivisionError),
        ]

        for dividend, divisor, expected_error in cases:
            divide = dividend.__div__
            with self.assertRaises(expected_error):
                divide(divisor)
Esempio n. 14
0
    def test_pinv(self):
        """Linalg.pinv must match the stored inverses for both matrix types."""
        for source, wanted in self.pinv_test_cases:
            dense_inv = Linalg.pinv(DenseMatrix(source))
            np.testing.assert_array_almost_equal(dense_inv.mat, wanted, 7)

            sparse_inv = Linalg.pinv(SparseMatrix(source))
            np.testing.assert_array_almost_equal(sparse_inv.mat.todense(),
                                                 wanted, 7)
Esempio n. 15
0
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.

    The following layouts are tried in order:

    1. ``<prefix>.pkl``: a single pickled Space object;
    2. ``<prefix>.npz`` plus the four ``<prefix>_*.pkl`` index files;
    3. ``<prefix>.tsv`` (tab-separated values) plus ``<prefix>.rows``;
    4. ``<prefix>.w2v``: one text file, first line skipped, each following
       line holding a target followed by its space-separated values.

    :param dsm_prefix: the prefix of the input files (.pkl, .rows, .cols)
    :return: the loaded Space; for layouts 3 and 4 the column list is empty
    """
    # NOTE: pickle files must come from a trusted source, since unpickling
    # can execute arbitrary code.

    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    if os.path.isfile(dsm_prefix + '.npz'):
        with np.load(dsm_prefix + '.npz') as loader:
            coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape'])

        cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

        with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
            row2id = pickle.load(f_in)

        with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
            id2row = pickle.load(f_in)

        with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
            column2id = pickle.load(f_in)

        with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
            id2column = pickle.load(f_in)

        return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id)

    if os.path.isfile(dsm_prefix + '.tsv'):
        values = np.loadtxt(dsm_prefix + '.tsv', dtype=float, delimiter='\t', skiprows=0, comments=None, encoding='utf-8')
        targets = np.loadtxt(dsm_prefix + '.rows', dtype=str, skiprows=0, comments=None, encoding='utf-8')
        # Convert to space in sparse matrix format
        return Space(SparseMatrix(values), list(targets), [])

    # If everything fails try to load it as single w2v file
    space_array = np.loadtxt(dsm_prefix + '.w2v', dtype=object, delimiter=' ', skiprows=1, comments=None, encoding='utf-8')
    targets = space_array[:, 0].flatten()
    # The builtin float replaces the np.float alias, which was removed in
    # NumPy 1.24; both denote the same 64-bit float dtype.
    values = space_array[:, 1:].astype(float)
    # Convert to space and sparse matrix format
    return Space(SparseMatrix(values), list(targets), [])
Esempio n. 16
0
    def test_plog(self):
        """Check the in-place ``plog`` against expected values (3 decimals)."""
        fractional = SparseMatrix(np.mat([[0.5, 1.0, 1.5], [2.0, 0.0, 2.5]]))
        fractional_want = np.mat([[0., 0., 0.4054], [0.6931, 0., 0.9162]])
        a_want = np.mat([[0., 0.6931, 1.0986], [1.3862, 0., 1.6094]])

        cases = [(self.matrix_a.copy(), a_want),
                 (fractional, fractional_want)]
        for subject, wanted in cases:
            subject.plog()
            numpy.testing.assert_array_almost_equal(subject.mat.todense(),
                                                    wanted, 3)
Esempio n. 17
0
    def _sparse_svd(matrix_, reduced_dimension):
        """
        Truncated SVD of a sparse matrix, computed with ``sparsesvd``.

        Patch

        Problem: sparsesvd sometimes returns fewer dimensions than requested.
        The patch will no longer be needed once sparsesvd accepts SVDLIBC
        parameters as input (the kappa parameter of SVDLIBC has to be larger
        than the default, e.g. 1E-05 instead of 1E-06).

        Current fix: ask for more dimensions (10% extra) and remove the
        unnecessary ones.

        :param matrix_: matrix wrapper whose ``mat`` supports ``tocsc()``
        :param reduced_dimension: number of dimensions to keep
        :return: tuple ``(u, s, v)``: u and v as SparseMatrix objects cut to
            at most ``reduced_dimension`` columns, s cut to the same length
        """
        # Original author's notes on scipy.sparse.linalg.svds, which is not
        # used here: it raises ValueError if the rank is smaller than
        # reduced_dimension + 1, and its values are not sorted. Those
        # problems must be fixed before switching to it.
        # u, s, vt = svds(matrix_.mat, False, True)

        extra_dims = int(reduced_dimension / 10)

        ut, s, vt = sparsesvd(matrix_.mat.tocsc(),
                              reduced_dimension + extra_dims)

        u = SparseMatrix(ut.transpose())
        v = SparseMatrix(vt.transpose())

        # sparsesvd may return more or fewer columns than requested.
        no_cols = min(u.shape[1], reduced_dimension)
        u = u[:, 0:no_cols]
        v = v[:, 0:no_cols]

        Linalg._check_reduced_dim(matrix_.shape[1], u.shape[1],
                                  reduced_dimension)

        # Flip the sign of both factors together so that u is mostly positive.
        if not u.is_mostly_positive():
            u = -u
            v = -v

        return u, s[0:no_cols], v
Esempio n. 18
0
    def setUp(self):
        """Create the small dense matrices and the Space fixtures."""
        def column(values):
            # Single-column DenseMatrix.
            return DenseMatrix(np.mat(values))

        def make_space(matrix_cls, rows, row_names):
            # Every Space gets its own fresh column-name list.
            return Space(matrix_cls(np.mat(rows)), row_names, ["f1", "f2"])

        self.m11 = column([[3], [9]])
        self.m21 = column([[4], [2]])
        self.ph1 = column([[18], [24]])

        self.space1 = make_space(SparseMatrix, [[3, 9], [4, 2]], ["a", "b"])
        self.space2 = make_space(SparseMatrix, [[7, 11]], ["a_b"])
        self.space3 = make_space(SparseMatrix, [[0, 0]], ["a_b"])

        self.space4 = make_space(DenseMatrix, [[3, 9], [4, 2]], ["a", "b"])
        self.space5 = make_space(DenseMatrix, [[7, 11]], ["a_b"])
        self.space6 = make_space(DenseMatrix, [[0, 0]], ["a_b"])
        self.space7 = make_space(DenseMatrix, [[7, 11], [7, 11]],
                                 ["a_b", "a_a"])
Esempio n. 19
0
    def test_sparse_svd(self):
        """Check Linalg.svd on sparse input, comparing absolute values.

        Dimensions 2, 3 and 6 are compared with the full expected factors;
        dimension 1 is compared with their first column / first value.
        """
        def check(matrix, dim, u_want, s_want, v_want):
            u, s, v = Linalg.svd(SparseMatrix(matrix), dim)
            np.testing.assert_array_almost_equal(np.abs(u.mat.todense()),
                                                 np.abs(u_want), 2)
            np.testing.assert_array_almost_equal(np.abs(s),
                                                 np.abs(s_want), 2)
            np.testing.assert_array_almost_equal(np.abs(v.mat.todense()),
                                                 np.abs(v_want), 2)

        for x, u_expected, s_expected, v_expected in self.svd_test_cases:
            for dim in [2, 3, 6]:
                check(x, dim, u_expected, s_expected, v_expected)

            check(x, 1, u_expected[:, 0:1], s_expected[0:1],
                  v_expected[:, 0:1])
Esempio n. 20
0
def read_sparse_space_data(matrix_file, row2id, column2id, dtype=np.double):
    """
    Read a whitespace-separated "row column value" file into a SparseMatrix.

    Blank lines are skipped. Fields after the third are ignored. Entries
    whose row or column is not in ``row2id`` / ``column2id`` are dropped.

    :param matrix_file: path of the input file; gzip is assumed when the
        name ends with ".gz"
    :param row2id: dictionary, mapping row strings to row indices
    :param column2id: dictionary, mapping column strings to column indices
    :param dtype: callable converting the value field, also the array dtype
    :return: SparseMatrix of shape (len(row2id), len(column2id))
    :raises ValueError: if a non-blank line has fewer than 3 fields
    """
    def _open_text():
        # Text mode: tokens must be str to compare with "" and to be found in
        # str-keyed dictionaries (binary mode yields bytes on Python 3).
        if matrix_file.endswith(".gz"):
            return gzip.open(matrix_file, "rt")
        return open(matrix_file, "r")

    # First pass: count the non-blank lines to pre-size the arrays.
    with _open_text() as f:
        no_lines = sum(1 for line in f if line.strip())

    row = np.zeros(no_lines, dtype=np.int32)
    col = np.zeros(no_lines, dtype=np.int32)
    data = np.zeros(no_lines, dtype=dtype)

    # Second pass: fill the arrays. "with" closes the file even when the
    # ValueError below is raised.
    i = 0
    with _open_text() as f:
        for line in f:
            line_elements = line.split()
            if not line_elements:
                continue
            if len(line_elements) < 3:
                raise ValueError(
                    "Invalid row: %s, expected at least %d fields" %
                    (line.strip(), 3))

            word1, word2, count = line_elements[0:3]
            if word1 in row2id and word2 in column2id:
                row[i] = row2id[word1]
                col[i] = column2id[word2]
                data[i] = dtype(count)
                i += 1
                if i % 1000000 == 0:
                    print("Progress...%d" % i)

    # eliminate the extra zeros created when word1 or word2 is not row2id or col2id!!
    data = data[0:i]
    row = row[0:i]
    col = col[0:i]

    m = SparseMatrix(
        csr_matrix((data, (row, col)), shape=(len(row2id), len(column2id))))
    if m.mat.nnz != i:
        warn(
            "Found 0-counts or duplicate row,column pairs. (Duplicate entries are summed up.)"
        )

    return m
Esempio n. 21
0
    def test_init(self):
        """DenseMatrix must accept every input type below and store np.matrix."""
        reference = self.a
        inputs = [
            reference,
            np.mat(reference),
            csr_matrix(reference),
            csc_matrix(reference),
            SparseMatrix(reference),
        ]

        for candidate in inputs:
            wrapped = DenseMatrix(candidate)
            self.assertIsInstance(wrapped.mat, np.matrix)
            numpy.testing.assert_array_equal(reference, np.array(wrapped.mat))
Esempio n. 22
0
def read_sparse_space_data_mod(input_list, row2id, column2id, dtype=np.double):
    """
    Transform matrix in tuple structure to DISSECT's sparse matrix format

    Empty tuples are skipped. Elements after the third are ignored. Entries
    whose row or column is not in ``row2id`` / ``column2id`` are dropped.

    :param input_list: list of inputs
    :param row2id: dictionary, mapping rows to ids
    :param column2id: dictionary, mapping columns to ids
    :param dtype: data type of cell values
    :return: sparse matrix
    :raises ValueError: if a non-empty entry has fewer than 3 elements
    """
    # The input is traversed twice (count, then fill). Materialise one-shot
    # iterables so the second pass does not see an exhausted iterator.
    if not isinstance(input_list, (list, tuple)):
        input_list = list(input_list)

    no_lines = sum(1 for line in input_list if line != ())

    row = np.zeros(no_lines, dtype=np.int32)
    col = np.zeros(no_lines, dtype=np.int32)
    data = np.zeros(no_lines, dtype=dtype)

    i = 0
    for line in input_list:
        if line == ():
            continue
        if len(line) < 3:
            # Entries are tuples, not strings, so they are formatted directly
            # (calling .strip() on them raised AttributeError).
            raise ValueError(
                "Invalid row: %s, expected at least %d fields" % (line, 3))

        word1, word2, count = line[0:3]
        if word1 in row2id and word2 in column2id:
            row[i] = row2id[word1]
            col[i] = column2id[word2]
            data[i] = dtype(count)
            i += 1
            if i % 1000000 == 0:
                # Prints the same text under Python 2 and Python 3.
                print("Progress...%d" % i)

    # eliminate the extra zeros created when word1 or word2 is not row2id or col2id!!
    data = data[0:i]
    row = row[0:i]
    col = col[0:i]

    m = SparseMatrix(
        csr_matrix((data, (row, col)), shape=(len(row2id), len(column2id))))
    if m.mat.nnz != i:
        warn(
            "Found 0-counts or duplicate row,column pairs. (Duplicate entries are summed up.)"
        )

    return m
Esempio n. 23
0
    def test_sparse_ridge_regression(self):
        """Compare ridge regression with plain least squares on sparse input.

        For each case the least-squares solution must match the stored
        inverse, and its (error + solution norm) must be at least as large as
        the same quantity for the ridge solution.
        """
        for matrix, inverse in self.pinv_test_cases:
            sparse_in = SparseMatrix(matrix)
            identity = SparseMatrix.identity(sparse_in.shape[0])

            lstsq_sol = Linalg.lstsq_regression(sparse_in, identity)
            np.testing.assert_array_almost_equal(lstsq_sol.mat.todense(),
                                                 inverse, 7)

            ridge_sol = Linalg.ridge_regression(sparse_in, identity, 1)[0]

            lstsq_error = (sparse_in * lstsq_sol - SparseMatrix(inverse)).norm()
            ridge_error = (sparse_in * ridge_sol - SparseMatrix(inverse)).norm()

            lstsq_total = lstsq_error + lstsq_sol.norm()
            ridge_total = ridge_error + ridge_sol.norm()

            # Original author's note: ridge_error >= lstsq_error should also
            # hold, but that check is disabled (possibly rounding error).
            self.assertGreaterEqual(lstsq_total, ridge_total)
Esempio n. 24
0
    def single_case_test(self, matrix_, expected, w):
        """Apply ``w`` to dense and sparse wrappers of ``matrix_``.

        Both outputs must equal ``expected`` to 7 decimals, and neither the
        wrappers nor ``matrix_`` itself may be modified by the call.
        """
        snapshot = matrix_.copy()
        dense_in = DenseMatrix(matrix_)
        sparse_in = SparseMatrix(matrix_)

        dense_out = w.apply(dense_in)
        sparse_out = w.apply(sparse_in)

        numpy.testing.assert_array_almost_equal(dense_out.mat, expected, 7)
        numpy.testing.assert_array_almost_equal(sparse_out.mat.todense(),
                                                expected, 7)

        # The inputs must be left untouched.
        numpy.testing.assert_array_equal(dense_in.mat, snapshot)
        numpy.testing.assert_array_equal(matrix_, snapshot)
        numpy.testing.assert_array_equal(sparse_in.mat.todense(), snapshot)
Esempio n. 25
0
    def test_resolve_type_conflict(self):
        """resolve_type_conflict must convert every element to DenseMatrix."""
        raw = np.mat([1, 2])

        dense = DenseMatrix(raw)
        sparse = SparseMatrix(raw)

        pair = resolve_type_conflict([dense, sparse], DenseMatrix)
        triple = resolve_type_conflict([sparse, dense, dense], DenseMatrix)
        nothing = resolve_type_conflict([], DenseMatrix)
        from_raw = resolve_type_conflict([raw, csr_matrix(raw)], DenseMatrix)

        # Unpacking also checks the number of elements returned.
        [c, d] = pair
        [e, f, g] = triple
        [u, v] = from_raw

        for converted in (c, d, e, f, g):
            self.assertIsInstance(converted, DenseMatrix)
        self.assertListEqual([], nothing)

        for converted in (g, u, v):
            self.assertIsInstance(converted, DenseMatrix)
Esempio n. 26
0
 def _sparse_pinv(matrix_):
     """Return the pseudo-inverse of ``matrix_`` as a SparseMatrix.

     The matrix is densified and passed to ``np.linalg.pinv``.
     """
     # TODO: implement pinv
     dense_view = matrix_.mat.todense()
     return SparseMatrix(np.linalg.pinv(dense_view))
Esempio n. 27
0
    def test_add_raises(self):
        """``__add__`` must raise TypeError for each operand pair below."""
        operand_pairs = [
            (self.matrix_a, self.a),
            (self.matrix_a, SparseMatrix(self.a)),
        ]

        for left, right in operand_pairs:
            add = left.__add__
            with self.assertRaises(TypeError):
                add(right)
Esempio n. 28
0
def main():
    """
    Align two sparse matrices by intersecting their columns.

    Both spaces are reduced to the columns they have in common, optionally
    row-normalized, and saved to <outPath1>.sm and <outPath2>.sm.
    """

    # Get the arguments
    args = docopt('''Align two sparse matrices by intersecting their columns.

    Usage:
        count_alignment_intersect.py [-l] <outPath1> <outPath2> <spacePrefix1> <spacePrefix2>

        <outPath1> = output path for aligned space 1
        <outPath2> = output path for aligned space 2
        <spacePrefix1> = path to pickled space1 without suffix
        <spacePrefix2> = path to pickled space2 without suffix

    Options:
        -l, --len   normalize final vectors to unit length
    
    ''')

    is_len = args['--len']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    outPath1 = args['<outPath1>']
    outPath2 = args['<outPath2>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)
    id2row1 = space1.get_id2row()
    id2row2 = space2.get_id2row()
    id2column1 = space1.get_id2column()
    id2column2 = space2.get_id2column()
    column2id1 = space1.get_column2id()
    column2id2 = space2.get_column2id()
    # The set intersection has no defined order, but the same list is used
    # for both spaces, so their columns end up in the same order.
    intersected_columns = list(set(id2column1).intersection(id2column2))
    # Position of every shared column in each original space.
    intersected_columns_id1 = [
        column2id1[item] for item in intersected_columns
    ]
    intersected_columns_id2 = [
        column2id2[item] for item in intersected_columns
    ]
    reduced_matrix1 = space1.get_cooccurrence_matrix(
    )[:, intersected_columns_id1].get_mat()
    reduced_matrix2 = space2.get_cooccurrence_matrix(
    )[:, intersected_columns_id2].get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(reduced_matrix1, axis=1, ord=2)
        l2norm2 = linalg.norm(reduced_matrix2, axis=1, ord=2)
        # All-zero rows would divide by zero; dividing by 1 leaves them as is.
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        l2norm2[l2norm2 == 0.0] = 1.0  # Convert 0 values to 1
        # Reshape the norms to a column so that each row is divided by its
        # own norm.
        reduced_matrix1 /= l2norm1.reshape(len(l2norm1), 1)
        reduced_matrix2 /= l2norm2.reshape(len(l2norm2), 1)

    reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1,
                           intersected_columns)
    reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2,
                           intersected_columns)

    # Sanity check: both spaces must report the same column list.
    if reduced_space1.get_id2column() != reduced_space2.get_id2column():
        sys.exit('Two spaces not properly aligned!')

    # Save the Space object in pickle format
    save_pkl_files(reduced_space1, outPath1 + '.sm', save_in_one_file=True)
    save_pkl_files(reduced_space2, outPath2 + '.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Esempio n. 29
0
    def test_svd_raises(self):
        """Linalg.svd must raise ValueError when asked for 0 dimensions."""
        matrices = [np.mat([[1, 2, 3], [2, 4, 6], [4, 675, 43]])]

        for raw in matrices:
            for wrapper in (DenseMatrix, SparseMatrix):
                wrapped = wrapper(raw)
                with self.assertRaises(ValueError):
                    Linalg.svd(wrapped, 0)
    def test_svd(self):
        """Check the Svd reduction on a dense and a sparse copy of one matrix.

        Requests for 2, 3 and 6 dimensions are all compared with the first
        two expected columns; a request for 1 dimension with the first one.
        The sparse results are compared in absolute value.
        """
        def raw_matrix():
            return np.mat([[1, 2, 3], [2, 4, 6], [4, 675, 43]])

        def expected_us():
            return np.mat([[2.19272110e+00, 3.03174768e+00, 0],
                           [4.38544220e+00, 6.06349536e+00, 0],
                           [6.76369708e+02, -4.91431927e-02, 0]])

        def expected_v():
            return np.mat([[0.0059, 0.9979, 0.0636],
                           [0.3255, -0.0621, 0.9434],
                           [0.945, 0.015, -0.325]]).transpose()

        # (requested dimensions, number of expected columns to compare with)
        requests = [(2, 2), (3, 2), (6, 2), (1, 1)]

        dense_cases = [(DenseMatrix(raw_matrix()), expected_us(),
                        expected_v())]

        for x, us_expected, v_expected in dense_cases:
            for dim, kept in requests:
                us, transmat = Svd(dim).apply(x)
                np.testing.assert_array_almost_equal(
                    us.mat, us_expected[:, 0:kept], 2)
                np.testing.assert_array_almost_equal(
                    transmat.mat, v_expected[:, 0:kept], 2)

        sparse_cases = [(SparseMatrix(raw_matrix()), expected_us(),
                         expected_v())]

        for x, us_expected, v_expected in sparse_cases:
            us_expected = np.abs(us_expected)
            v_expected = np.abs(v_expected)

            for dim, kept in requests:
                us, transmat = Svd(dim).apply(x)
                np.testing.assert_array_almost_equal(
                    np.abs(us.mat.todense()), us_expected[:, 0:kept], 2)
                np.testing.assert_array_almost_equal(
                    np.abs(transmat.mat.todense()), v_expected[:, 0:kept], 2)