def getMetaFeatures(questionPosts, answerPosts):
	dataByQuestion = {}
	for qPost in questionPosts:
		curr = nlp(BeautifulSoup(qPost['Body']).get_text())
		numChars = math.log(len(curr.text) + 1)
		numWords = math.log(len(list(curr)) + 1)
		numSents = math.log(len(list(curr.sents)) + 1)
		qWordSum = math.log(len([tok for tok in curr if tok.lemma_ in questionWords]) + 1)
		qWordData = [qWord in curr.text.lower() for qWord in questionWords]
		dataByQuestion[qPost['Id']] = [numChars, numWords, numSents, qWordSum] + qWordData + [curr]
	qMeta = []
	aMeta = []
	qaSim = []
	qVectors = []
	aVectors = []
	for aPost in answerPosts:
		curr = nlp(BeautifulSoup(aPost['Body']).get_text())
		qData = dataByQuestion[aPost['ParentId']]
		numChars = math.log(len(curr.text) + 1)
		numWords = math.log(len(list(curr)) + 1)
		numSents = math.log(len(list(curr.sents)) + 1)
		qMeta += [qData[:11]]
		aMeta += [[numChars, numWords, numSents]]
		qaSim += [[qData[11].similarity(curr)]]
		qVectors += [list(qData[11].vector)]
		aVectors += [list(curr.vector)]
	return sparse.csr_matrix(qMeta), sparse.csr_matrix(aMeta), sparse.csr_matrix(qaSim), sparse.csr_matrix(qVectors), sparse.csr_matrix(aVectors)
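The five blocks returned above are CSR matrices with one row per answer, so they can be stacked column-wise into a single design matrix. A minimal sketch with toy placeholder arrays (not the real spaCy features):

import numpy as np
from scipy import sparse

qMeta = sparse.csr_matrix(np.random.rand(4, 11))   # question meta-features (toy)
aMeta = sparse.csr_matrix(np.random.rand(4, 3))    # answer meta-features (toy)
qaSim = sparse.csr_matrix(np.random.rand(4, 1))    # question/answer similarity (toy)
X = sparse.hstack([qMeta, aMeta, qaSim], format='csr')
print(X.shape)  # (4, 15)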
Example #2
def test_cs_graph_components():
    D = np.eye(4, dtype=bool)

    warn_ctx = WarningManager()
    warn_ctx.__enter__()
    try:
        warnings.filterwarnings("ignore",
                    message="`cs_graph_components` is deprecated")

        n_comp, flag = csgraph.cs_graph_components(csr_matrix(D))
        assert_(n_comp == 4)
        assert_equal(flag, [0, 1, 2, 3])

        D[0, 1] = D[1, 0] = 1

        n_comp, flag = csgraph.cs_graph_components(csr_matrix(D))
        assert_(n_comp == 3)
        assert_equal(flag, [0, 0, 1, 2])

        # A pathological case...
        D[2, 2] = 0
        n_comp, flag = csgraph.cs_graph_components(csr_matrix(D))
        assert_(n_comp == 2)
        assert_equal(flag, [0, 0, -2, 1])
    finally:
        warn_ctx.__exit__()
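Note: cs_graph_components has since been removed from SciPy; a hedged sketch of the current scipy.sparse.csgraph.connected_components API, which returns plain 0..k-1 labels and has no -2 flag for isolated nodes:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

D = np.eye(4, dtype=bool)
D[0, 1] = D[1, 0] = True
n_comp, labels = connected_components(csr_matrix(D), directed=False)
print(n_comp, labels)  # 3 components, e.g. labels [0 0 1 2]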
Example #3
def test_mkl_spsolve6():
    """
    MKL splu : Repeated RHS solve (Complex)
    """
    row = np.array([0,0,1,2,2,2])
    col = np.array([0,2,2,0,1,2])
    data = np.array([1,2,3,-4,5,6], dtype=complex)
    sM = sp.csr_matrix((data,(row,col)), shape=(3,3), dtype=complex)
    M = sM.toarray()

    row = np.array([0,0,1,1,0,0])
    col = np.array([0,2,1,1,0,0])
    data = np.array([1,1,1,1,1,1], dtype=complex)
    sN = sp.csr_matrix((data, (row,col)), shape=(3,3), dtype=complex)
    N = sN.toarray()

    sX = np.zeros((3,3),dtype=complex)
    lu = mkl_splu(sM)

    for k in range(3):
        sX[:,k] = lu.solve(N[:,k])
    lu.delete()
    
    X = la.solve(M,N)
    assert_array_almost_equal(X, sX)
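For reference, a rough SciPy-only equivalent of the repeated right-hand-side pattern (assuming MKL is unavailable), using scipy.sparse.linalg.splu, which prefers CSC input; the matrices here are illustrative:

import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as spla

M = sp.csc_matrix(np.array([[1, 0, 2], [0, 0, 3], [-4, 5, 6]], dtype=complex))
N = np.eye(3, dtype=complex)
lu = spla.splu(M)                                   # sparse LU factorisation
X = np.column_stack([lu.solve(N[:, k]) for k in range(3)])
np.testing.assert_allclose(M.toarray() @ X, N, atol=1e-12)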
    def setUp(self):
        self.term_map = {
            'animal': csr_matrix([0.1, 0.0, 0.3, 0.4]),
            'cat': csr_matrix([0.0, 0.5, 0.0, 1.0]),
        }
        self.sim = SimCalculator()
        self.pair = ('cat', 'animal')
    def _transform_sparse(self, X):
        indices = X.indices.copy()
        indptr = X.indptr.copy()

        data_step = np.sqrt(X.data * self.sample_interval_)
        X_step = sp.csr_matrix((data_step, indices, indptr),
                               shape=X.shape, dtype=X.dtype, copy=False)
        X_new = [X_step]

        log_step_nz = self.sample_interval_ * np.log(X.data)
        step_nz = 2 * X.data * self.sample_interval_

        for j in range(1, self.sample_steps):
            factor_nz = np.sqrt(step_nz /
                                np.cosh(np.pi * j * self.sample_interval_))

            data_step = factor_nz * np.cos(j * log_step_nz)
            X_step = sp.csr_matrix((data_step, indices, indptr),
                                   shape=X.shape, dtype=X.dtype, copy=False)
            X_new.append(X_step)

            data_step = factor_nz * np.sin(j * log_step_nz)
            X_step = sp.csr_matrix((data_step, indices, indptr),
                                   shape=X.shape, dtype=X.dtype, copy=False)
            X_new.append(X_step)

        return sp.hstack(X_new)
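One caveat worth keeping in mind (a sketch, not part of the transformer): sp.hstack does not necessarily return CSR, so callers that need CSR attributes should pass format='csr' or convert explicitly:

import numpy as np
import scipy.sparse as sp

blocks = [sp.csr_matrix(np.random.rand(3, 2)) for _ in range(3)]
X = sp.hstack(blocks, format='csr')   # force CSR output
print(X.format, X.shape)              # csr (3, 6)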
Example #6
def die(first_noun, second_noun, trans_verb):
    """Vectorize a sentence with 'noun die noun verb' = (sub, obj)."""
    noun_model = space.words.polyglot_model()
    noun_space = noun_model[0]

    die_vector = compose.train.die_cat_stored()
    ver_vector = compose.train.verb(trans_verb, noun_model)

    fst_vector = noun_space[first_noun]
    snd_vector = noun_space[second_noun]

    par_vector_sub = kron(
        csr_matrix(snd_vector), csr_matrix(ver_vector))
    par_vector_obj = kron(
        csr_matrix(snd_vector), numpy.transpose(csr_matrix(ver_vector)))

    par_vector_sub = kron(
        numpy.transpose(csr_matrix(fst_vector)), csr_matrix(par_vector_sub))
    par_vector_obj = kron(
        numpy.transpose(csr_matrix(fst_vector)), csr_matrix(par_vector_obj))

    vector_sub = numpy.multiply(csr_matrix(die_vector), par_vector_sub)
    vector_obj = numpy.multiply(csr_matrix(die_vector), par_vector_obj)

    return (vector_sub.toarray().flatten(), vector_obj.toarray().flatten())
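The tensor-product step above in isolation, as a small self-contained sketch using scipy.sparse.kron on toy CSR row vectors (the noun/verb spaces are placeholders, not the polyglot model):

import numpy as np
from scipy.sparse import csr_matrix, kron

noun = csr_matrix(np.array([[1.0, 0.0, 2.0]]))   # toy noun vector
verb = csr_matrix(np.array([[0.5, 3.0, 0.0]]))   # toy verb vector
pair = kron(noun, verb, format='csr')            # tensor product, shape (1, 9)
print(pair.toarray())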
Example #7
    def test_sparse_concat(self):
        x_d = np.array([0, 7, 2, 3], dtype=np.float32)
        x_r = np.array([0, 2, 2, 3], dtype=np.int64)
        x_c = np.array([4, 3, 2, 3], dtype=np.int64)

        x_sparse_1 = sparse.csr_matrix((x_d, (x_r, x_c)), shape=(4, 5))

        x_d = np.array([0, 7, 2, 3], dtype=np.float32)
        x_r = np.array([0, 2, 2, 3], dtype=np.int64)
        x_c = np.array([4, 3, 2, 3], dtype=np.int64)

        x_sparse_2 = sparse.csr_matrix((x_d, (x_r, x_c)), shape=(4, 5))

        x_dense_1 = x_sparse_1.toarray()
        x_dense_2 = x_sparse_2.toarray()

        backends = [KTF]
        if KTH.th_sparse_module:
            # Theano has some dependency issues for sparse
            backends.append(KTH)

        for K in backends:
            k_s = K.concatenate([K.variable(x_sparse_1), K.variable(x_sparse_2)])
            assert K.is_sparse(k_s)

            k_s_d = K.eval(k_s)

            k_d = K.eval(K.concatenate([K.variable(x_dense_1), K.variable(x_dense_2)]))

            assert k_s_d.shape == k_d.shape
            assert_allclose(k_s_d, k_d, atol=1e-05)
Example #8
    def matrix_completion_task(self):
        X = sparse.csr_matrix(self._X * (self.descr["mask"] == 0))
        Y = sparse.csr_matrix(self._X * (self.descr["mask"] == 1))
        assert X.nnz == (self.descr["mask"] == 0).sum()
        assert Y.nnz == (self.descr["mask"] == 1).sum()
        # entries where mask == 2 belong to neither X nor Y
        return X, Y
Example #9
def geometry(Nr,Nz,parms):

    r = np.linspace(-parms.Lr, parms.Lr, Nr+1)
    hr= r[1]-r[0]
    r = r[::-1]
    e = np.ones(Nr)

    Dr = (np.diag(e,-1) - np.diag(e,1))/(2*hr)
    Dr[0,0:2] = np.array([1,-1])/hr
    Dr[Nr,Nr-1:Nr+1] = np.array([1,-1])/hr

    Dr2 = (np.diag(e,-1) - 2*np.diag(np.ones(Nr+1),0) + np.diag(e,1))/hr**2
    Dr2[0,0:3] = np.array([1,-2,1])/hr**2
    Dr2[Nr,Nr-2:Nr+1] = np.array([1,-2,1])/hr**2

    z = np.linspace(-parms.Lz, 0, Nz)
    hz=z[1]-z[0]
    z = z[::-1]
    e = np.ones(Nz-1)

    Dz = (np.diag(e,-1) - np.diag(e,1))/(2*hz)
    Dz[0,0:3] = np.array([-3,4,-1])/(2*hz)
    Dz[Nz-1,Nz-3:Nz] = np.array([1,-4,3])/(2*hz)

    Dz2 = (np.diag(e,-1) - 2*np.diag(np.ones(Nz),0) + np.diag(e,1))/hz**2
    Dz2[0,0:3] = np.array([1,-2,1])/hz**2
    Dz2[Nz-1,Nz-3:Nz] = np.array([1,-2,1])/hz**2

    Dr = sp.csr_matrix(Dr); Dr2 = sp.csr_matrix(Dr2)
    Dz = sp.csr_matrix(Dz); Dz2 = sp.csr_matrix(Dz2)

    return [Dr,Dr2,r,Dz,Dz2,z]
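The same differentiation-matrix idea can be sketched with scipy.sparse.diags, building the operator directly in sparse form instead of densifying with np.diag first (an independent sketch, not the geometry() code):

import numpy as np
import scipy.sparse as sp

N = 8
x = np.linspace(0.0, 1.0, N)
h = x[1] - x[0]
e = np.ones(N - 1)
D = sp.lil_matrix(sp.diags([-e, e], offsets=[-1, 1]) / (2 * h))  # centred differences
D[0, 0:2] = np.array([-1.0, 1.0]) / h            # one-sided at the left boundary
D[N - 1, N - 2:N] = np.array([-1.0, 1.0]) / h    # one-sided at the right boundary
D = D.tocsr()
np.testing.assert_allclose(D @ x, np.ones(N))    # d/dx of x is 1 everywhere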
Example #10
def test_ddp_sorting():
    beta = 0.95

    # Sorted
    s_indices = [0, 0, 1]
    a_indices = [0, 1, 0]
    a_indptr = [0, 2, 3]
    R = [0, 1, 2]
    Q = [(1, 0), (1/2, 1/2), (0, 1)]
    Q_sparse = sparse.csr_matrix(Q)

    # Shuffled
    s_indices_shuffled = [0, 1, 0]
    a_indices_shuffled = [0, 0, 1]
    R_shuffled = [0, 2, 1]
    Q_shuffled = [(1, 0), (0, 1), (1/2, 1/2)]
    Q_shuffled_sparse = sparse.csr_matrix(Q_shuffled)

    ddp0 = DiscreteDP(R, Q, beta, s_indices, a_indices)
    ddp_sparse = DiscreteDP(R, Q_sparse, beta, s_indices, a_indices)
    ddp_shuffled = DiscreteDP(R_shuffled, Q_shuffled, beta,
                              s_indices_shuffled, a_indices_shuffled)
    ddp_shuffled_sparse = DiscreteDP(R_shuffled, Q_shuffled_sparse, beta,
                                     s_indices_shuffled, a_indices_shuffled)

    for ddp in [ddp0, ddp_sparse, ddp_shuffled, ddp_shuffled_sparse]:
        assert_array_equal(ddp.s_indices, s_indices)
        assert_array_equal(ddp.a_indices, a_indices)
        assert_array_equal(ddp.a_indptr, a_indptr)
        assert_array_equal(ddp.R, R)
        if sparse.issparse(ddp.Q):
            ddp_Q = ddp.Q.toarray()
        else:
            ddp_Q = ddp.Q
        assert_array_equal(ddp_Q, Q)
def test_pairwise_kernels(metric):
    # Test the pairwise_kernels helper function.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((2, 4))
    function = PAIRWISE_KERNEL_FUNCTIONS[metric]
    # Test with Y=None
    K1 = pairwise_kernels(X, metric=metric)
    K2 = function(X)
    assert_array_almost_equal(K1, K2)
    # Test with Y=Y
    K1 = pairwise_kernels(X, Y=Y, metric=metric)
    K2 = function(X, Y=Y)
    assert_array_almost_equal(K1, K2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric)
    assert_array_almost_equal(K1, K2)

    # Test with sparse X and Y
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    if metric in ["chi2", "additive_chi2"]:
        # these don't support sparse matrices yet
        assert_raises(ValueError, pairwise_kernels,
                      X_sparse, Y=Y_sparse, metric=metric)
        return
    K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric)
    assert_array_almost_equal(K1, K2)
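Quick usage sketch: pairwise_kernels accepts CSR input directly for kernels that support sparse data, e.g. the linear kernel:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_kernels

X = csr_matrix(np.random.RandomState(0).random_sample((5, 4)))
K = pairwise_kernels(X, metric='linear')   # Gram matrix from sparse input
print(K.shape)  # (5, 5)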
Example #12
def _steadystate_direct_sparse(L, verbose=False):
    """
    Direct solver that use scipy sparse matrices
    """
    if verbose:
        print('Starting direct solver...')

    n = prod(L.dims[0][0])
    b = sp.csr_matrix(([1.0], ([0], [0])), shape=(n ** 2, 1), dtype=complex)
    M = L.data + sp.csr_matrix((np.ones(n),
            (np.zeros(n), [nn * (n + 1) for nn in range(n)])),
            shape=(n ** 2, n ** 2))
    
    use_solver(assumeSortedIndices=True, useUmfpack=False)
    M.sort_indices()

    if verbose:
        start_time = time.time()
    # Do the actual solving here
    v = spsolve(M, b)

    if verbose:
        print('Direct solver time: ', time.time() - start_time)
    
    data = vec2mat(v)
    data = 0.5 * (data + data.conj().T)

    return Qobj(data, dims=L.dims[0], isherm=True)
Example #13
    def __init__(self, A, W, **kw):
        self.columns = kw.get('columns', np.arange(A.shape[1]))
        self.A = sparse.csr_matrix(A)
        self.W = sparse.csr_matrix(np.diag(W))
        self.O = self.A[:, self.columns].T * self.W * self.A
        self.O.data = np.log(self.O.data) + 1.
        self.O.data[np.isnan(self.O.data)] = 0.
def test_paired_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))
    for metric, func in iteritems(PAIRED_DISTANCES):
        S = paired_distances(X, Y, metric=metric)
        S2 = func(X, Y)
        assert_array_almost_equal(S, S2)
        S3 = func(csr_matrix(X), csr_matrix(Y))
        assert_array_almost_equal(S, S3)
        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # Check that the pairwise_distances implementation
            # gives the same value
            distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            distances = np.diag(distances)
            assert_array_almost_equal(distances, S)

    # Check the callable implementation
    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y)
def test_euclidean_distances():
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # check that we still get the right answers with {X,Y}_norm_squared
    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_array_almost_equal(D2, D1)
    assert_array_almost_equal(D3, D1)
    assert_array_almost_equal(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(wrong_D - D1)), .01)
Example #16
    def text2spvec(self, query):
        """Create a sparse tfidf-weighted word vector from query.

        tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))
        """
        # Get hashed ngrams
        words = self.parse(utils.normalize(query))
        wids = [utils.hash(w, self.hash_size) for w in words]

        if len(wids) == 0:
            if self.strict:
                raise RuntimeError('No valid word in: %s' % query)
            else:
                logger.warning('No valid word in: %s' % query)
                return sp.csr_matrix((1, self.hash_size))

        # Count TF
        wids_unique, wids_counts = np.unique(wids, return_counts=True)
        tfs = np.log1p(wids_counts)

        # Count IDF
        Ns = self.doc_freqs[wids_unique]
        idfs = np.log((self.num_docs - Ns + 0.5) / (Ns + 0.5))
        idfs[idfs < 0] = 0

        # TF-IDF
        data = np.multiply(tfs, idfs)

        # One row, sparse csr matrix
        indptr = np.array([0, len(wids_unique)])
        spvec = sp.csr_matrix(
            (data, wids_unique, indptr), shape=(1, self.hash_size)
        )

        return spvec
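The single-row construction above uses the (data, indices, indptr) form of csr_matrix; a tiny standalone example of the same pattern with made-up hashed ids:

import numpy as np
import scipy.sparse as sp

data = np.array([0.5, 1.2, 0.3])         # tf-idf weights (toy)
indices = np.array([4, 17, 42])          # hashed word ids (toy)
indptr = np.array([0, len(indices)])     # a single row
spvec = sp.csr_matrix((data, indices, indptr), shape=(1, 64))
print(spvec.nnz, spvec.shape)            # 3 (1, 64)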
Example #17
    def predict(self, X):
        """ Predict values of X from internal dictionary and intercepts

        Parameters
        ----------
        X: csr-matrix (n_samples, n_features)
            Matrix holding the loci of prediction

        Returns
        -------
        X_pred: csr-matrix (n_samples, n_features)
            Matrix with the same sparsity structure as X, with predicted values
        """
        X = sp.csr_matrix(X)
        out = np.zeros_like(X.data)
        _predict(out, X.indices, X.indptr, self.P_,
                 self.Q_)

        if self.detrend:
            for i in range(X.shape[0]):
                out[X.indptr[i]:X.indptr[i + 1]] += self.row_mean_[i]
            out += self.col_mean_.take(X.indices, mode='clip')

        if self.crop is not None:
            out[out > self.crop[1]] = self.crop[1]
            out[out < self.crop[0]] = self.crop[0]

        return sp.csr_matrix((out, X.indices, X.indptr), shape=X.shape)
Example #18
def test_svd_matrix(W, WT, D, DT):
    Winv = ss.csr_matrix(np.linalg.pinv(W.todense()))
    WTinv = ss.csr_matrix(np.linalg.pinv(W.transpose().todense()))
#    A = np.dot(np.dot(Winv, D), WTinv)
    A = ((Winv * D) * WTinv)
    A = A.tocsc()
    res_dict = {}
    old_z = 0

    for k in range(270, 280):
        (ut, s, vt) = sparsesvd(A, k)
        U = ss.csr_matrix(ut.T)
        S = ss.csr_matrix(np.diag(s))
        V = ss.csr_matrix(vt)
        L = (W * U) * (S * V * WT.transpose())
        z = U.shape[1]

        if z == old_z:
            break

        else:
            Res = fnorm(L, DT)
            res_dict[z] = Res
            Result = OrderedDict(sorted(res_dict.items(),
                key=lambda t: np.float64(t[1])))
            old_z = z

    return Result
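If sparsesvd is unavailable, scipy.sparse.linalg.svds computes the same truncated SVD (a hedged alternative; note its singular values come back in ascending order):

import scipy.sparse as ss
from scipy.sparse.linalg import svds

A = ss.random(50, 40, density=0.1, format='csc', random_state=0)
u, s, vt = svds(A, k=5)      # truncated SVD of a sparse matrix
print(s[::-1])               # largest singular values first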
Example #19
def check_smw_solver(p, q, r, s):
    # Helper to check that _smw_solver results do in fact solve the desired
    # SMW equation
    d = q - r

    A = np.random.normal(size=(p, q))
    AtA = np.dot(A.T, A)

    B = np.zeros((q, q))
    B[0:r, 0:r] = np.random.normal(size=(r, r))
    di = np.random.uniform(size=d)
    B[r:q, r:q] = np.diag(1 / di)
    Qi = np.linalg.inv(B[0:r, 0:r])
    s = 0.5

    x = np.random.normal(size=p)
    y2 = np.linalg.solve(s * np.eye(p, p) + np.dot(A, np.dot(B, A.T)), x)

    f = _smw_solver(s, A, AtA, Qi, di)
    y1 = f(x)
    assert_allclose(y1, y2)

    f = _smw_solver(s, sparse.csr_matrix(A), sparse.csr_matrix(AtA), Qi,
                    di)
    y1 = f(x)
    assert_allclose(y1, y2)
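For reference, the Sherman-Morrison-Woodbury identity that _smw_solver is being checked against, verified numerically on a small random instance (a sketch, not part of the test suite):

import numpy as np

rng = np.random.default_rng(0)
p, q, s = 5, 3, 0.5
A = rng.normal(size=(p, q))
B = np.cov(rng.normal(size=(q, 2 * q)))          # any symmetric positive-definite matrix
lhs = np.linalg.inv(s * np.eye(p) + A @ B @ A.T)
rhs = (np.eye(p) - A @ np.linalg.inv(s * np.linalg.inv(B) + A.T @ A) @ A.T) / s
np.testing.assert_allclose(lhs, rhs, atol=1e-12)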
Example #20
def test_unsorted_indices():
    # test that the result with sorted and unsorted indices in csr is the same
    # we use a subset of digits as iris, blobs or make_classification didn't
    # show the problem
    digits = load_digits()
    X, y = digits.data[:50], digits.target[:50]
    X_test = sparse.csr_matrix(digits.data[50:100])

    X_sparse = sparse.csr_matrix(X)
    coef_dense = svm.SVC(kernel='linear', probability=True,
                         random_state=0).fit(X, y).coef_
    sparse_svc = svm.SVC(kernel='linear', probability=True,
                         random_state=0).fit(X_sparse, y)
    coef_sorted = sparse_svc.coef_
    # make sure dense and sparse SVM give the same result
    assert_array_almost_equal(coef_dense, coef_sorted.toarray())

    X_sparse_unsorted = X_sparse[np.arange(X.shape[0])]
    X_test_unsorted = X_test[np.arange(X_test.shape[0])]

    # make sure we scramble the indices
    assert_false(X_sparse_unsorted.has_sorted_indices)
    assert_false(X_test_unsorted.has_sorted_indices)

    unsorted_svc = svm.SVC(kernel='linear', probability=True,
                           random_state=0).fit(X_sparse_unsorted, y)
    coef_unsorted = unsorted_svc.coef_
    # make sure unsorted indices give same result
    assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray())
    assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted),
                              sparse_svc.predict_proba(X_test))
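Side note (sketch): has_sorted_indices and sort_indices() are the CSR attributes this test leans on; whether fancy row indexing actually scrambles the indices depends on the SciPy version:

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.random.RandomState(0).rand(5, 8))
X_sub = X[np.arange(X.shape[0])]   # row fancy indexing
print(X_sub.has_sorted_indices)    # may be False on older SciPy versions
X_sub.sort_indices()               # restores sorted column indices in place
print(X_sub.has_sorted_indices)    # True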
Example #21
def check_matrix_and_delay(matrix=None, delay_matrix=None, source_size=None, target_size=None,
                           name='connection matrix'):
    if matrix is None:
        if delay_matrix is not None:
            raise ValueError('You cannot have a delay matrix without having a weight matrix')

        print('No %s found, making empty matrix' % name)
        matrix = spsp.csr_matrix((source_size, target_size))
    else:
        if source_size is not None:
            assert source_size == matrix.shape[0]
        else:
            source_size = matrix.shape[0]
        if target_size is not None:
            assert target_size == matrix.shape[1]
        else:
            target_size = matrix.shape[1]

    if delay_matrix is None:
        print('No delays of %s found, making empty matrix' % name)
        delay_matrix = spsp.csr_matrix((source_size, target_size))

    if spsp.issparse(delay_matrix):
        data = delay_matrix.data
    else:
        data = delay_matrix
    if len(data)> 0:
        max_delay = np.max(data)
    else:
        max_delay = 0.0

    return matrix, delay_matrix, max_delay
Example #22
def HiptmairMatrixSetup(mesh, N, M):

    path = os.path.abspath(os.path.join(inspect.getfile(inspect.currentframe()), ".."))
    if __version__ == '1.6.0':
        gradient_code = open(os.path.join(path, 'DiscreteGradientSecond.cpp'), 'r').read()
    else:
        gradient_code = open(os.path.join(path, 'DiscreteGradient.cpp'), 'r').read()
    compiled_gradient_module = compile_extension_module(code=gradient_code)

    column =  numpy.zeros(2*mesh.num_edges(), order="C") #, dtype="intc")
    row =  numpy.zeros(2*mesh.num_edges(), order="C") #, dtype="intc")
    data =  numpy.zeros(2*mesh.num_edges(), order="C") #, dtype="intc")

    dataX =  numpy.zeros(2*mesh.num_edges(), order="C")
    dataY =  numpy.zeros(2*mesh.num_edges(), order="C")
    dataZ =  numpy.zeros(2*mesh.num_edges(), order="C")

    tic()
    c = compiled_gradient_module.ProlongationGradsecond(mesh, dataX,dataY,dataZ, data, row, column)
    end = toc()
    MO.StrTimePrint("Data for C and P created, time: ",end)
    # print row
    # print column
    # print  data
    C = csr_matrix((data,(row,column)), shape=(N, M)).tocsr()
    Px = csr_matrix((dataX,(row,column)), shape=(N, M)).tocsr()
    Py = csr_matrix((dataY,(row,column)), shape=(N, M)).tocsr()
    Pz = csr_matrix((dataZ,(row,column)), shape=(N, M)).tocsr()
    return C, [Px,Py,Pz]
Example #23
	def __init__(self,dimension,alpha,lambda_,n,alpha_2, cluster_init="Complete"):
		self.time = 0
		#N_LinUCBAlgorithm.__init__(dimension = dimension, alpha=alpha,lambda_ = lambda_,n=n)
		self.users = []
		#algorithm have n users, each user has a user structure
		for i in range(n):
			self.users.append(CLUBUserStruct(dimension,lambda_, i)) 

		self.dimension = dimension
		self.alpha = alpha
		self.alpha_2 = alpha_2
		if (cluster_init=="Erdos-Renyi"):
			p = 3*math.log(n)/n
			self.Graph = np.random.choice([0, 1], size=(n,n), p=[1-p, p])
			self.clusters = []
			g = csr_matrix(self.Graph)
			N_components, components = connected_components(g)
		else:
			self.Graph = np.ones([n,n]) 
			self.clusters = []
			g = csr_matrix(self.Graph)
			N_components, components = connected_components(g)

		self.CanEstimateCoUserPreference = False
		self.CanEstimateUserPreference = False
		self.CanEstimateW = False
Example #24
def get_sij(rij):
    rji = rij.transpose()
    dij = csr_matrix(get_dij(rij))
    dji = csr_matrix(get_dij(rji))
    rijcsr = csr_matrix(rij)
    res = dij.dot(rijcsr).dot(dji)
    return res.todense()
def gen_app_pop_count(dev_app, ga_train, ga_test, base_dir='/data'):
    start_time = time.time()

    print('generating popularity weighted app count per device')

    app_popularity = dev_app.groupby(['app_id'])['device_id'].agg(
        {'popularity': lambda x: x.nunique()})
    app_pop_count = dev_app.groupby(['device_id'])['app_id'].agg(
        {'app_pop_count': lambda x: app_popularity.loc[x.unique(), 'popularity'].sum()})


    app_count_train = ga_train['device_id'].map(
        app_pop_count['app_pop_count']).fillna(0)
    app_count_train = app_count_train / app_count_train.max()

    app_count_train = csr_matrix(app_count_train.values).transpose()

    app_count_test = ga_test['device_id'].map(app_pop_count['app_pop_count']).fillna(0)
    app_count_test = app_count_test / app_count_test.max()

    app_count_test = csr_matrix(app_count_test.values).transpose()

    print('train set shape: ', app_count_train.shape)
    io.mmwrite(base_dir + "train_apppopcount.mtx", app_count_train)

    print('test set shape: ', app_count_test.shape)
    io.mmwrite(base_dir + "test_apppopcount.mtx", app_count_test)
    print('Time generating app pop count: ', (time.time() - start_time) / 60)
Example #26
def test_multiclass_to_ranking():
    X = sp.csr_matrix(np.arange(6).reshape((3,2)))
    y = sp.csr_matrix((3, 5))
    y[0, 0] = 1
    y[1, [2, 3]] = 1
    y[2, [0, 4]] = 1

    n_classes = y.shape[1]
    n_samples = X.shape[0]
    n_features = X.shape[1]

    X_ext, compars = multiclass_to_ranking(X, y)


    assert X_ext.shape[0] == n_classes * n_samples
    assert X_ext.shape[1] == n_classes + n_features

    # test that features are replicated
    assert_array_equal(X_ext.tocsc()[:, n_classes:].sum(axis=0),
            X.sum(axis=0) * n_classes)

    # test class labels encoding structure
    assert X_ext.tocsr()[:n_classes, :].sum(axis=0)[0, 0] == n_samples
    assert X_ext.tocsr()[:, :n_classes].sum() == n_samples * n_classes
    #assert_array_equal(X_ext.tocsc()[:, n_classes:].sum(axis=0),

    print(X_ext.todense())
    print(y.todense())
    print(compars)
Example #27
    def setUp(self):
        #      3---4
        #    / | / |
        #  0---1---2
        G0 = array([[0, 1, 0, 1, 0],
                    [1, 0, 1, 1, 1],
                    [0, 1, 0, 0, 1],
                    [1, 1, 0, 0, 1],
                    [0, 1, 1, 1, 0]])
        self.G0 = csr_matrix(G0)
        # make sure graph is symmetric
        assert_equal((self.G0 - self.G0.T).nnz, 0)

        #  2        5
        #  | \    / |
        #  0--1--3--4
        G1 = array([[0, 1, 1, 0, 0, 0],
                    [1, 0, 1, 1, 0, 0],
                    [1, 1, 0, 0, 0, 0],
                    [0, 1, 0, 0, 1, 1],
                    [0, 0, 0, 1, 0, 1],
                    [0, 0, 0, 1, 1, 0]])
        self.G1 = csr_matrix(G1)
        # make sure graph is symmetric
        assert_equal((self.G1 - self.G1.T).nnz, 0)
Example #28
def test_feature_inference_fails():

    # On predict if we try to use feature inference and supply
    # higher ids than the number of features that were supplied to fit
    # we should complain

    no_users, no_items = (10, 100)
    no_features = 20

    train = sp.coo_matrix((no_users,
                           no_items),
                          dtype=np.int32)

    user_features = sp.csr_matrix((no_users,
                                   no_features),
                                  dtype=np.int32)
    item_features = sp.csr_matrix((no_items,
                                   no_features),
                                  dtype=np.int32)
    model = LightFM()
    model.fit_partial(train,
                      user_features=user_features,
                      item_features=item_features)

    with pytest.raises(AssertionError):
        model.predict(np.array([no_features], dtype=np.int32),
                      np.array([no_features], dtype=np.int32))
    def test_sparse_quad_obj(self):
        times_dense = []
        times_sparse = []
        for n in [20, 200, 2000]:
            m1 = n // 2
            A_sparse = 0.9
            data = generate_data(n=n, m1=m1, A_sparse=A_sparse)
            A, b, x_true = data['A'], data['b'], data['x_true']
            A_sparse = sps.csr_matrix(A)
            A_sparse_T = sps.csr_matrix(A.T)
            Q, c = construct_qp_from_least_squares(A, b)
            Q_sparse = sps.csr_matrix(Q)

            def obj_np(x, g):
                return quad_obj_np(x, Q, c, g)

            def obj_sparse(x, g):
                return sparse_least_squares_obj(x, A_sparse_T, A_sparse, b, g)

            g = np.zeros(n)
            start_time = time.time()
            obj_np(x_true, g)
            times_dense.append(time.time() - start_time)
            start_time = time.time()
            obj_sparse(x_true, g)
            times_sparse.append(time.time() - start_time)
        print('times for sparse QP', times_sparse)
        print('times for dense QP', times_dense)
Example #30
    def tidyup(self,atol=qset.auto_tidyup_atol):
        """Removes small elements from a quantum object.

        Parameters
        ----------
        atol : float 
            Absolute tolerance used by tidyup.  Default is set 
            via qutip global settings parameters.

        Returns
        -------
        oper: qobj
            Quantum object with small elements removed.
        
        """
        abs_data = abs(self.data.data.flatten())
        if any(abs_data):
            mx = max(abs_data)
            if mx >= 1e-15:
                data = abs(self.data.data)
                outdata = self.data.copy()
                outdata.data[data < (atol * mx + np.finfo(float).eps)] = 0
            else:
                outdata = sp.csr_matrix((self.shape[0], self.shape[1]),
                                        dtype=complex)
        else:
            outdata = sp.csr_matrix((self.shape[0], self.shape[1]), dtype=complex)

        outdata.eliminate_zeros()
        return Qobj(outdata, dims=self.dims, shape=self.shape, type=self.type,
                    isherm=self.isherm)
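The tidy-up pattern in isolation (a sketch, not qutip code): zero out entries below a tolerance in the CSR data array, then drop the explicit zeros with eliminate_zeros():

import numpy as np
import scipy.sparse as sp

M = sp.csr_matrix(np.array([[1.0, 1e-18, 0.0], [0.0, 2.0, 1e-20]]))
M.data[np.abs(M.data) < 1e-12] = 0.0   # mask tiny entries
M.eliminate_zeros()                    # drop the explicit zeros from the structure
print(M.nnz)  # 2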
Example #31
def links2vec(links,out_path,tmp_path,dim=100,cds=1.0,eig=0.5,verbose='none'):
    logger = logging.getLogger(__name__ + ".links2vec")
    #80204: Language Learning - Clustering pipeline January 2018.ipynb
    '''links => PMI'''
    #-cds = 1.0  # cds = float(args['--cds']) # Context distribution smoothing [default: 1.0]
    pmi_path = tmp_path + 'pmi'
    start = time.time()
    #-linkz = links.loc[(links['count'] > 2)]
    linkz = links
    words = linkz.groupby('word').sum().reset_index() \
        .sort_values(by=['count','word'], ascending=[False,True])
    contexts = linkz.groupby('link').sum().reset_index() \
        .sort_values(by=['count','link'], ascending=[False,True])
    # if verbose in ['max','debug']:
    #     print('Linkz:', len(linkz), 'items')
    #     with pd.option_context('display.max_rows', 6): print(linkz)
    #     print('words:', len(words), 'items')
    #     with pd.option_context('display.max_rows', 6): print(words,'\n')
    #     print('contexts:', len(contexts), 'items')
    #     with pd.option_context('display.max_rows', 6): print(contexts)
    logger.info(f'Linkz: {len(linkz)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f"{linkz}")
    logger.info(f'words: {len(words)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f'{words}\n')
    logger.info(f'contexts: {len(contexts)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f"{contexts}")

    iw = sorted(words['word'].drop_duplicates().values.tolist())
    ic = sorted(contexts['link'].drop_duplicates().values.tolist())
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000   # ~ batch size
    i = 0
    for row in linkz.itertuples():
        if row.word in wi and row.link in ci:
            tmp_counts[wi[row.word], ci[row.link]] = int(row.count)
        i += 1
        if i == update_threshold:
            counts = counts + tmp_counts.tocsr()
            tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
            i = 0
    counts = counts + tmp_counts.tocsr()
    list2tsv(iw, pmi_path + '.words.vocab')     # any need to save?
    list2tsv(ic, pmi_path + '.contexts.vocab')
    # if verbose in ['max','debug']: print('PMI data saved to', pmi_path)
    logger.info(f'PMI data saved to {pmi_path}')

    pmi = calc_pmi(counts, cds)
    np.savez_compressed(pmi_path, \
        data=pmi.data, indices=pmi.indices, indptr=pmi.indptr, shape=pmi.shape)
    # if verbose in ['max','debug']:
    #   print('PMI matrix', type(pmi), pmi.shape, '\nsaved to', pmi_path)
    logger.info(f'PMI matrix {type(pmi)}, {pmi.shape}\nsaved to {pmi_path}')

    '''PMI => SVD'''
    svd_path = pmi_path[:-3] + 'svd'
    neg = 1     # int(args['--neg'])  Number of negative samples;
                # [default: 1]        subtracts its log from PMI
    # if verbose in ['max','debug']:
    #   print('SVD started: dim', dim, ', output:', svd_path+'...')
    logger.info(f'SVD started: dim {dim}, output: {svd_path}...')

    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    np.save(svd_path + '.ut.npy', ut)
    np.save(svd_path + '.s.npy', s)
    np.save(svd_path + '.vt.npy', vt)
    list2tsv(explicit.iw, svd_path + '.words.vocab')  # any need to save?
    list2tsv(explicit.ic, svd_path + '.contexts.vocab')
    # if verbose in ['max','debug']:
    #     print('SVD matrix (3 files .npy) saved:', len(ut[0]), 'vectors, ', \
    #         'ut:', len(ut), 's:', len(s), 'vt:', len(vt))
    logger.info(f'SVD matrix (3 files .npy) saved: {len(ut[0])} vectors, ut: {len(ut)}, s: {len(s)}, vt: {len(vt)}')

    '''SVD => vectors.txt'''
    out_file = out_path + 'vectors.txt'
    svd = SVDEmbedding(svd_path, True, eig)
    with open(out_file, 'w') as file:
        for i, w in enumerate(svd.iw):
            file.write(w+' '+(' '.join([str(x) for x in svd.m[i]]))+'\n')
    readme_path = out_path + 'vectors_readme.txt'
    readme = 'Word vectors: dimension '+str(dim)+', '+str(len(svd.iw))+' vectors'
    with open(readme_path, 'w') as f: f.write(readme)
    # if verbose != 'none':
    #     print('vectors saved to\n', out_file, \
    #         '- elapsed', int(round(time.time() - start, 0)), 's ~', \
    #       round((time.time() - start)/len(ut[0])*1000, 3), 'ms/vector')
    logger.warning(f'vectors saved to\n {out_file} - elapsed {int(round(time.time() - start, 0))} s ~ '
                   f'{round((time.time() - start)/len(ut[0])*1000, 3)} ms/vector')

    response = {'vectors_file': out_file}
    return response
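The counting loop above in miniature: accumulate single-element updates in a DOK matrix and fold them into the CSR accumulator in batches, which is the point of update_threshold (a standalone sketch with toy data):

import numpy as np
from scipy.sparse import csr_matrix, dok_matrix

counts = csr_matrix((3, 4), dtype=np.float32)
batch = dok_matrix((3, 4), dtype=np.float32)
for i, (r, c, v) in enumerate([(0, 1, 2.0), (2, 3, 1.0), (1, 0, 5.0)]):
    batch[r, c] = v
    if (i + 1) % 2 == 0:               # flush every 2 updates (cf. update_threshold)
        counts = counts + batch.tocsr()
        batch = dok_matrix((3, 4), dtype=np.float32)
counts = counts + batch.tocsr()        # fold in the remainder
print(counts.toarray())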
def load_sparse_csr(filename):
  loader = np.load(filename)
  return sparse.csr_matrix(
      (loader["data"], loader["indices"], loader["indptr"]),
      shape=loader["shape"],
      dtype=np.float32)
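Round-trip sketch for this loader: save the raw CSR arrays with np.savez and rebuild the matrix from (data, indices, indptr) plus the stored shape (the filename here is illustrative):

import numpy as np
from scipy import sparse

M = sparse.random(4, 6, density=0.3, format='csr', random_state=0)
np.savez("matrix.npz", data=M.data, indices=M.indices, indptr=M.indptr, shape=M.shape)
loader = np.load("matrix.npz")
M2 = sparse.csr_matrix((loader["data"], loader["indices"], loader["indptr"]),
                       shape=loader["shape"])
assert np.array_equal(M.toarray(), M2.toarray())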
def convert_X(mat):
    mat = csr_matrix(mat)
    return mat
Example #34
    def represent(self, w):
        if w in self.wi:
            return self.m[self.wi[w], :]
        else:
            return csr_matrix((1, len(self.ic)))
Example #35
def load_matrix(f):
    if not f.endswith('.npz'):
        f += '.npz'
    loader = np.load(f)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
Example #36
def pmisvd(links,path,tmpath, dim=100, cds=1.0, eig=0.5, neg=1, verbose='none'):
    logger = logging.getLogger(__name__ + ".pmisvd")
    '''80223 epmisvd enhanced: return +singular values'''
    # path - dir to save vectors.txt and readme
    # path - dir to save temporary files
    # cds = 1.0 # context distribution smoothing [default: 1.0]
    # eig = 0.5 # weighted exponent of the eigenvalue matrix [default: 0.5]
    # neg = 1   # Number of negative samples; [default: 1] subtracts its log from PMI
                # PMI => SVD PositiveExplicit parameter
    if tmpath[-1] == '/': tmpath = tmpath[:-1]
    if path[-1] == '/': path = path[:-1]

    '''links => PMI'''
    pmi_path = tmpath + '/pmi'
    start = time.time()
    #-linkz = links.loc[(links['count'] > 2)]
    linkz = links
    words = linkz.groupby('word').sum().reset_index()\
        .sort_values(by=['count','word'], ascending=[False,True])
    contexts = linkz.groupby('link').sum().reset_index()\
        .sort_values(by=['count','link'], ascending=[False,True])

    iw = sorted(words['word'].drop_duplicates().values.tolist())
    ic = sorted(contexts['link'].drop_duplicates().values.tolist())
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000   # ~ batch size
    i = 0
    for row in linkz.itertuples():
        if row.word in wi and row.link in ci:
            tmp_counts[wi[row.word], ci[row.link]] = int(row.count)
        i += 1
        if i == update_threshold:
            counts = counts + tmp_counts.tocsr()
            tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
            i = 0
    counts = counts + tmp_counts.tocsr()
    list2tsv(iw, pmi_path + '.words.vocab')  # any need to save?
    list2tsv(ic, pmi_path + '.contexts.vocab')

    '''counts + vocab => pmi'''
    pmi = calc_pmi(counts, cds)
    np.savez_compressed(pmi_path, \
        data=pmi.data, indices=pmi.indices, indptr=pmi.indptr, shape=pmi.shape)

    '''PMI => SVD'''
    svd_path = pmi_path[:-3] + 'svd'
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    np.save(svd_path + '.ut.npy', ut)
    np.save(svd_path + '.s.npy', s)
    np.save(svd_path + '.vt.npy', vt)
    list2tsv(explicit.iw, svd_path + '.words.vocab')  # any need to save?
    list2tsv(explicit.ic, svd_path + '.contexts.vocab')

    '''SVD => vectors.txt'''
    svd = SVDEmbedding(svd_path, True, eig)   # TODO: move code here, RAM2RAM
    if len(svd.m[0]) < dim: dim = len(svd.m[0])   # 80216
    vectors_df = pd.DataFrame(columns=['word'] + list(range(1,dim+1)))
    for i, w in enumerate(svd.iw):
        vectors_df.loc[i] = [w] + svd.m[i].tolist()
    out_file = path + '/vectors.txt'
    with open(out_file, 'w') as file:
        for i, w in enumerate(svd.iw):
            file.write(w+' '+(' '.join([str(x) for x in svd.m[i]]))+'\n')
    readme_path = path + '/vectors_readme.txt'
    readme = 'Word vectors: dimension '+str(dim)+', '+str(len(svd.iw))+' vectors'
    with open(readme_path, 'w') as f: f.write(readme)

    singular_values = s.tolist()  # type(s): numpy.ndarray
    return vectors_df, singular_values, {'vectors_file': out_file}
Example #37
def epmisvd(links,path,tmpath,dim=100,cds=1.0,eig=0.5,neg=1,verbose='none'):
    logger = logging.getLogger(__name__ + ".epmisvd")
    # cds = 1.0 # context distribution smoothing [default: 1.0]
    # eig = 0.5 # weighted exponent of the eigenvalue matrix [default: 0.5]
    # neg = 1   # Number of negative samples; [default: 1] subtracts its log from PMI
                # PMI => SVD PositiveExplicit parameter
    '''links => PMI'''
    pmi_path = tmpath + 'pmi'
    start = time.time()
    #-linkz = links.loc[(links['count'] > 2)]
    linkz = links
    words = linkz.groupby('word').sum().reset_index()\
        .sort_values(by=['count','word'], ascending=[False,True])
    contexts = linkz.groupby('link').sum().reset_index()\
        .sort_values(by=['count','link'], ascending=[False,True])
    # if verbose in ['max','debug']:
    #     print('Linkz:', len(linkz), 'items')
    #     with pd.option_context('display.max_rows', 6): print(linkz)
    #     print('words:', len(words), 'items')
    #     with pd.option_context('display.max_rows', 6): print(words,'\n')
    #     print('contexts:', len(contexts), 'items')
    #     with pd.option_context('display.max_rows', 6): print(contexts)
    logger.info(f'Linkz: {len(linkz)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f'{linkz}')
    logger.info(f'words: {len(words)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f'{words}\n')
    logger.info(f'contexts: {len(contexts)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f'{contexts}')

    iw = sorted(words['word'].drop_duplicates().values.tolist())
    ic = sorted(contexts['link'].drop_duplicates().values.tolist())
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000   # ~ batch size
    i = 0
    for row in linkz.itertuples():
        if row.word in wi and row.link in ci:
            tmp_counts[wi[row.word], ci[row.link]] = int(row.count)
        i += 1
        if i == update_threshold:
            counts = counts + tmp_counts.tocsr()
            tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
            i = 0
    counts = counts + tmp_counts.tocsr()
    list2tsv(iw, pmi_path + '.words.vocab')  # any need to save?
    list2tsv(ic, pmi_path + '.contexts.vocab')
    # if verbose in ['max','debug']: print('PMI data saved to', pmi_path)
    logger.info(f'PMI data saved to {pmi_path}')

    '''counts + vocab => pmi'''
    pmi = calc_pmi(counts, cds)
    np.savez_compressed(pmi_path, \
        data=pmi.data, indices=pmi.indices, indptr=pmi.indptr, shape=pmi.shape)
    # if verbose in ['max','debug']:
    #   print('PMI matrix', type(pmi), pmi.shape, '\nsaved to', pmi_path)
    logger.info(f'PMI matrix {type(pmi)} {pmi.shape}\nsaved to {pmi_path}')

    '''PMI => SVD'''
    svd_path = pmi_path[:-3] + 'svd'
    # if verbose in ['max','debug']:
    #     print('SVD started: dim', dim, ', output:', svd_path+'...')
    logger.info(f'SVD started: dim {dim}, output: {svd_path}...')

    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)
    #print('explicit.m:', explicit.m)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    np.save(svd_path + '.ut.npy', ut)
    np.save(svd_path + '.s.npy', s)
    np.save(svd_path + '.vt.npy', vt)
    list2tsv(explicit.iw, svd_path + '.words.vocab')  # any need to save?
    list2tsv(explicit.ic, svd_path + '.contexts.vocab')
    # if verbose in ['max','debug']:
    #     print('SVD matrix (3 files .npy) saved:', len(ut[0]), 'vectors, ', \
    #           'ut:', len(ut), 's:', len(s), 'vt:', len(vt))
    logger.info(f'SVD matrix (3 files .npy) saved: {len(ut[0])} vectors, ut: {len(ut)} s: {len(s)} vt:{len(vt)}')

    '''SVD => vectors.txt'''
    svd = SVDEmbedding(svd_path, True, eig)   # TODO: move code here, RAM2RAM
    if len(svd.m[0]) < dim: dim = len(svd.m[0])   # 80216
    vectors_df = pd.DataFrame(columns=['word'] + list(range(1,dim+1)))
    for i, w in enumerate(svd.iw):
        vectors_df.loc[i] = [w] + svd.m[i].tolist()
    out_file = path + 'vectors.txt'
    with open(out_file, 'w') as file:
        for i, w in enumerate(svd.iw):
            file.write(w+' '+(' '.join([str(x) for x in svd.m[i]]))+'\n')
    readme_path = path + 'vectors_readme.txt'
    readme = 'Word vectors: dimension '+str(dim)+', '+str(len(svd.iw))+' vectors'
    with open(readme_path, 'w') as f: f.write(readme)
    # if verbose in ['max','debug']:
    #     print('vectors saved to\n', out_file, \
    #         '- elapsed', int(round(time.time() - start, 0)), 's ~', \
    #       round((time.time() - start)/len(ut[0])*1000, 3), 'ms/vector')
    logger.info(f'vectors saved to\n {out_file} - elapsed {int(round(time.time() - start, 0))} s ~ '
                f'{round((time.time() - start)/len(ut[0])*1000, 3)} ms/vector')

    response = {'vectors_file': out_file}
    return vectors_df, response
import numpy as np
import scipy.sparse as sps
from External_Libraries.Notebooks_utils.data_splitter import train_test_holdout
from External_Libraries.Similarity.Compute_Similarity_Python import Compute_Similarity_Python
from External_Libraries.Notebooks_utils.evaluation_function import evaluate_algorithm
import matplotlib.pyplot as pyplot
from External_Libraries.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from External_Libraries.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from External_Libraries.ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt
from External_Libraries.ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
from External_Libraries.DataIO import DataIO
from External_Libraries.Base.Recommender_utils import check_matrix
from External_Libraries.Base.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from External_Libraries.SLIM_ElasticNet.SLIMElasticNetRecommender import SLIMElasticNetRecommender

URM_all = sps.csr_matrix(sps.load_npz("../../Dataset/old/data_all.npz"))
URM_train = sps.csr_matrix(sps.load_npz("../../Dataset/old/data_train.npz"))
URM_test = sps.csr_matrix(sps.load_npz("../../Dataset/old/data_test.npz"))

class ItemKNNScoresHybridRecommender(BaseItemSimilarityMatrixRecommender):

    RECOMMENDER_NAME = "ItemKNNScoresHybridRecommender"

    def __init__(self, URM_train, Recommender_1, Recommender_2):
        super(ItemKNNScoresHybridRecommender, self).__init__(URM_train)

        self.URM_train = check_matrix(URM_train.copy(), 'csr')
        self.Recommender_1 = Recommender_1
        self.Recommender_2 = Recommender_2

    def fit(self, alpha):
        # (fit body truncated in the original listing)
        ...
Example #39
    def testSolveTriangular(self):
        from mars.tensor import tril, triu
        np.random.seed(1)

        data1 = np.random.randint(1, 10, (20, 20))
        data2 = np.random.randint(1, 10, (20, ))

        A = tensor(data1, chunk_size=20)
        b = tensor(data2, chunk_size=20)

        x = solve_triangular(A, b)
        t = triu(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        x = solve_triangular(A, b, lower=True)
        t = tril(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        A = tensor(data1, chunk_size=10)
        b = tensor(data2, chunk_size=10)

        x = solve_triangular(A, b)
        t = triu(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        x = solve_triangular(A, b, lower=True)
        t = tril(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        data1 = np.random.randint(1, 10, (10, 10))
        data2 = np.random.randint(1, 10, (10, 5))

        A = tensor(data1, chunk_size=10)
        b = tensor(data2, chunk_size=10)

        x = solve_triangular(A, b)
        t = triu(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        x = solve_triangular(A, b, lower=True)
        t = tril(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        A = tensor(data1, chunk_size=3)
        b = tensor(data2, chunk_size=3)

        x = solve_triangular(A, b)
        t = triu(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        x = solve_triangular(A, b, lower=True)
        t = tril(A).dot(x)

        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data2)

        # test sparse
        data1 = sps.csr_matrix(np.triu(np.random.randint(1, 10, (10, 10))))
        data2 = np.random.random((10, ))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve_triangular(A, b)

        result_x = self.executor.execute_tensor(x, concat=True)[0]
        result_b = data1.dot(result_x)

        self.assertIsInstance(result_x, SparseNDArray)
        np.testing.assert_allclose(result_b, data2)

        data1 = sps.csr_matrix(np.triu(np.random.randint(1, 10, (10, 10))))
        data2 = np.random.random((10, 2))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve_triangular(A, b)

        result_x = self.executor.execute_tensor(x, concat=True)[0]
        result_b = data1.dot(result_x)

        self.assertIsInstance(result_x, SparseNDArray)
        np.testing.assert_allclose(result_b, data2)
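A hedged SciPy counterpart for the sparse triangular case above: scipy.sparse.linalg.spsolve_triangular solves a triangular CSR system directly (lower=False for upper-triangular):

import numpy as np
import scipy.sparse as sps
from scipy.sparse.linalg import spsolve_triangular

A = sps.csr_matrix(np.triu(np.random.RandomState(0).randint(1, 10, (6, 6))).astype(float))
b = np.random.RandomState(1).random_sample(6)
x = spsolve_triangular(A, b, lower=False)   # upper-triangular solve on CSR
np.testing.assert_allclose(A @ x, b, atol=1e-12)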
Example #40
def test_check_array():
    # accept_sparse == False
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    with pytest.raises(TypeError):
        check_array(X_csr)

    # ensure_2d=False
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert X_array.ndim == 1
    # ensure_2d=True with 1d array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got 1D array instead"):
        check_array([0, 1, 2], ensure_2d=True)

    # ensure_2d=True with scalar array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got scalar array instead"):
        check_array(10, ensure_2d=True)

    # don't allow ndim > 3
    X_ndim = np.arange(8).reshape(2, 2, 2)
    with pytest.raises(ValueError):
        check_array(X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(int)
    X_float = X_C.astype(float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, int, float, np.float32, None, bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and
                    X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']):
                assert X is X_checked

    # allowed sparse != None
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(int)
    X_float = X_csc.astype(float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X, dtype=dtype,
                                    accept_sparse=accept_sparse, copy=copy)
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            # XXX unreached code as of v0.22
            message = str(w[0].message)
            messages = ["object dtype is not supported by sparse matrices",
                        "Can't check dok sparse matrix for nan or inf."]
            assert message in messages
        else:
            assert len(w) == 0
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if X.format in accept_sparse:
            # no change if allowed
            assert X.format == X_checked.format
        else:
            # got converted
            assert X_checked.format == accept_sparse[0]
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if X.dtype == X_checked.dtype and X.format == X_checked.format:
                assert X is X_checked

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, np.ndarray)
    # raise on too deep lists
    with pytest.raises(ValueError):
        check_array(X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise

    # convert weird stuff to arrays
    X_no_array = _NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert isinstance(result, np.ndarray)
def test_variance_threshold():
    """Test VarianceThreshold with custom variance."""
    for X in [data, csr_matrix(data)]:
        X = VarianceThreshold(threshold=.4).fit_transform(X)
        assert_equal((len(data), 1), X.shape)
Example #42
    def testSolve(self):
        import scipy.linalg
        np.random.seed(1)

        data1 = np.random.randint(1, 10, (20, 20))
        data2 = np.random.randint(1, 10, (20, ))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve(A, b)

        res = self.executor.execute_tensor(x, concat=True)[0]
        np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2))
        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        np.testing.assert_allclose(res, data2)

        data2 = np.random.randint(1, 10, (20, 5))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve(A, b)

        res = self.executor.execute_tensor(x, concat=True)[0]
        np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2))
        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        np.testing.assert_allclose(res, data2)

        data2 = np.random.randint(1, 10, (20, 20))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve(A, b)

        res = self.executor.execute_tensor(x, concat=True)[0]
        np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2))
        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        np.testing.assert_allclose(res, data2)

        # test for not all chunks are square in matrix A
        data2 = np.random.randint(1, 10, (20, ))

        A = tensor(data1, chunk_size=6)
        b = tensor(data2, chunk_size=6)

        x = solve(A, b)

        res = self.executor.execute_tensor(x, concat=True)[0]
        np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2))
        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        np.testing.assert_allclose(res, data2)

        A = tensor(data1, chunk_size=(7, 6))
        b = tensor(data2, chunk_size=6)

        x = solve(A, b)

        res = self.executor.execute_tensor(x, concat=True)[0]
        np.testing.assert_allclose(res, scipy.linalg.solve(data1, data2))
        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        np.testing.assert_allclose(res, data2)

        # test sparse
        data1 = sps.csr_matrix(np.random.randint(1, 10, (20, 20)))
        data2 = np.random.randint(1, 10, (20, ))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve(A, b)

        res = self.executor.execute_tensor(x, concat=True)[0]
        self.assertIsInstance(res, SparseNDArray)
        np.testing.assert_allclose(data1.dot(res), data2)

        data2 = np.random.randint(1, 10, (20, 5))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve(A, b)

        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        self.assertIsInstance(res, SparseNDArray)
        np.testing.assert_allclose(res, data2)

        data2 = np.random.randint(1, 10, (20, 20))

        A = tensor(data1, chunk_size=5)
        b = tensor(data2, chunk_size=5)

        x = solve(A, b)

        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        self.assertIsInstance(res, SparseNDArray)
        np.testing.assert_allclose(res, data2)

        # test for not all chunks are square in matrix A
        data2 = np.random.randint(1, 10, (20, ))

        A = tensor(data1, chunk_size=6)
        b = tensor(data2, chunk_size=6)

        x = solve(A, b)

        res = self.executor.execute_tensor(A.dot(x), concat=True)[0]
        np.testing.assert_allclose(res, data2)
def _read_facts(fact_file, relation_embeddings, question_embedding,
                seeds, qId):
    """Read all triples from the fact file and create a sparse adjacency
    matrix between the entities. Returns mapping of entities to their
    indices, a mapping of relations to the
    and the combined adjacency matrix."""
    seeds_found = set()
    with open(fact_file) as f:
        entity_map = {}
        relation_map = {}
        all_row_ones, all_col_ones = [], []
        num_entities = 0
        num_facts = 0
        for line in f:
            try:
                e1, rel, e2 = line.strip().split(None, 2)
            except ValueError:
                continue
            if _filter_relation(rel): continue
            if e1 not in entity_map:
                entity_map[e1] = num_entities
                num_entities += 1
            if e2 not in entity_map:
                entity_map[e2] = num_entities
                num_entities += 1
            if rel not in relation_map:
                relation_map[rel] = [[], []]
            if e1 in seeds: seeds_found.add(e1)
            if e2 in seeds: seeds_found.add(e2)
            all_row_ones.append(entity_map[e1])
            all_col_ones.append(entity_map[e2])
            all_row_ones.append(entity_map[e2])
            all_col_ones.append(entity_map[e1])
            relation_map[rel][0].append(entity_map[e1])
            relation_map[rel][1].append(entity_map[e2])
            num_facts += 1
            if num_facts == MAX_FACTS:
                break
    if not relation_map:
        return {}, {}, None
    for rel in relation_map:
        row_ones, col_ones = relation_map[rel]
        m = csr_matrix(
            (np.ones((len(row_ones),)), (np.array(row_ones), np.array(col_ones))),
            shape=(num_entities, num_entities))
        relation_map[rel] = normalize(m, norm="l1", axis=1)
        if RELATION_WEIGHTING:
            if rel not in relation_embeddings:
                score = NOTFOUNDSCORE
            else:
                score = np.dot(question_embedding, relation_embeddings[rel]) / (
                        np.linalg.norm(question_embedding) *
                        np.linalg.norm(relation_embeddings[rel]))
            relation_map[rel] = relation_map[rel] * np.power(score, EXPONENT)
    if DECOMPOSE_PPV:
        adj_mat = sum(relation_map.values()) / len(relation_map)
    else:
        adj_mat = csr_matrix(
            (np.ones((len(all_row_ones),)), (np.array(all_row_ones), np.array(all_col_ones))),
            shape=(num_entities, num_entities))
    return entity_map, relation_map, normalize(adj_mat, norm="l1", axis=1)
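# A minimal sketch of the RELATION_WEIGHTING step in _read_facts above: each
# relation's L1-row-normalized adjacency matrix is scaled by the cosine
# similarity between the question and relation embeddings, raised to EXPONENT.
# The embeddings and EXPONENT = 2 below are hypothetical toy values.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

question_embedding = np.array([1.0, 0.0, 1.0])
relation_embedding = np.array([0.5, 0.5, 0.0])

score = np.dot(question_embedding, relation_embedding) / (
        np.linalg.norm(question_embedding) * np.linalg.norm(relation_embedding))

rel_adj = csr_matrix(np.array([[0., 1., 1.],
                               [1., 0., 0.],
                               [0., 0., 0.]]))
rel_adj = normalize(rel_adj, norm="l1", axis=1)   # each non-empty row sums to 1
rel_adj = rel_adj * np.power(score, 2)            # down-weight less relevant relations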
Example #44
0
    def testLUExecution(self):
        np.random.seed(1)

        data = np.random.randint(1, 10, (6, 6))

        a = tensor(data)
        P, L, U = lu(a)

        # check lower and upper triangular matrix
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data)

        a = tensor(data, chunk_size=2)
        P, L, U = lu(a)

        # check lower and upper triangular matrix
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data)

        a = tensor(data, chunk_size=(2, 3))
        P, L, U = lu(a)

        # check lower and upper triangular matrix
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data)

        a = tensor(data, chunk_size=4)
        P, L, U = lu(a)

        # check lower and upper triangular matrix
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_allclose(res, data)

        # test for sparse
        data = sps.csr_matrix([[2, 0, 0, 0, 5, 2], [0, 6, 1, 0, 0, 6],
                               [8, 0, 9, 0, 0, 2], [0, 6, 0, 8, 7, 3],
                               [7, 0, 6, 1, 7, 0], [0, 0, 0, 7, 0, 8]])

        a = tensor(data)
        P, L, U = lu(a)
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        # check lower and upper triangular matrix
        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)
        self.assertIsInstance(result_l, SparseNDArray)
        self.assertIsInstance(result_u, SparseNDArray)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_array_almost_equal(data.A, res)

        a = tensor(data, chunk_size=2)
        P, L, U = lu(a)
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        # check lower and upper triangular matrix
        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)
        self.assertIsInstance(result_l, SparseNDArray)
        self.assertIsInstance(result_u, SparseNDArray)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_array_almost_equal(data.A, res)

        a = tensor(data, chunk_size=(2, 3))
        P, L, U = lu(a)
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        # check lower and upper triangular matrix
        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)
        self.assertIsInstance(result_l, SparseNDArray)
        self.assertIsInstance(result_u, SparseNDArray)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_array_almost_equal(data.A, res)

        a = tensor(data, chunk_size=4)
        P, L, U = lu(a)
        result_l = self.executor.execute_tensor(L, concat=True)[0]
        result_u = self.executor.execute_tensor(U, concat=True)[0]

        # check lower and upper triangular matrix
        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)
        self.assertIsInstance(result_l, SparseNDArray)
        self.assertIsInstance(result_u, SparseNDArray)

        t = P.dot(L).dot(U)
        res = self.executor.execute_tensor(t, concat=True)[0]
        np.testing.assert_array_almost_equal(data.A, res)
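# A quick standalone sketch of what the assertions above verify, using SciPy's
# dense LU factorization on a small random matrix (sizes are illustrative):
import numpy as np
import scipy.linalg as la

data = np.random.randint(1, 10, (6, 6))
p, l, u = la.lu(data)                      # data == p @ l @ u
np.testing.assert_allclose(np.tril(l), l)  # l is lower triangular
np.testing.assert_allclose(np.triu(u), u)  # u is upper triangular
np.testing.assert_allclose(p.dot(l).dot(u), data)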
Example #45
0
 def __init__(self, A):
     if isinstance(A, str):
         super().__init__(A)
     else:
         A = csr_matrix(A)
         super().__init__(A.indptr, A.indices, A.data.astype('uint32'), A.get_shape()[1])
Example #46
0
    def eval(self):
        if TIME: start = timer()
        
        # get held values if they exist
        if self.buffer == True and not self.disp == "":
            load = json.loads(self.disp)
            U = np.array(load)
            return U

        # set input sockets
        input_socket_1 = self.inputs[0]
        input_socket_2 = self.inputs[1]
        input_socket_3 = self.inputs[2]
        input_socket_4 = self.inputs[3]
        input_socket_5 = self.inputs[4]
        input_socket_6 = self.inputs[5]
        input_socket_7 = self.inputs[6]
        input_socket_8 = self.inputs[7]

        input_socket_1.set_value(self.E)
        input_socket_2.set_value(self.v)

        # get inputs from previous nodes
        # print("!!!")
        # print(self.material_input)
        if self.material_input == "VALUE":
            E = self.get_value(input_socket_1)
            v = self.get_value(input_socket_2)
            
        elif self.material_input == "NODE":
            # print("NODE")
            mat_vect = self.get_value(input_socket_3)
            # print(mat_vect)
            E = mat_vect[0]
            v = mat_vect[2]

        if self.size_input == "VALUE":
            input_socket_4.set_value(self.t)
            t = self.get_value(input_socket_4)
        else:
            t = self.get_value(input_socket_5)


                
        self.object = self.get_value(input_socket_6)
        object = self.object
        ob = object.data        

        # create bmesh environment
        bm = bmesh.new()
        bm.from_mesh(self.object.data)

        # create a matrix of values for edges
        #new row 1
            # column 0 = element number
            # column 1 = node 1
            # column 2 = node 2
        # new row 2
            # column 0 = modulus of elasticity
            # column 1 = area
            # column 2 = G
            # column 3 = y moment of inertia
            # column 4 = z moment of inertia
            # column 5 = J
        edge_matrix = np.zeros((3), dtype=int)
        properties = [0,0,0,0,0,0]
        coormat = np.zeros((9))
        new_row_1 = np.zeros((3), dtype=int)
        new_row_2 = properties
        i = 0
        G = 0

        for face in bm.faces:
            j = 0
            coorelement = np.array([])
            # print(face.index)
            for vert in face.verts:
                # vert = loop.vert
                coordinates = np.array([vert.co[0], vert.co[1], vert.co[2]])
                coorelement = np.hstack([coorelement, coordinates])
                new_row_1[j] = vert.index
                j = j + 1
                # print(vert.index, coordinates)

            new_row_2[0] = E
            new_row_2[1] = G
            new_row_2[2] = v
            new_row_2[3] = t
            if DEBUG: print(new_row_1,new_row_2)
            edge_matrix = np.vstack([edge_matrix, new_row_1])
            properties = np.vstack([properties, new_row_2])
            # print(coorelement)
            # print(coormat)
            coormat = np.vstack([coormat, coorelement])
            i += 1
        edge_matrix = np.delete(edge_matrix, 0, 0) # find better way to initialize (redundant)
        properties = np.delete(properties, 0, 0)
        coormat = np.delete(coormat, 0, 0)
        print(coormat)
        if DEBUG: print('edge_matrix',edge_matrix)
        if DEBUG: print('properties',properties)
        # print(edge_matrix.shape)
        max = (edge_matrix[:,1:].max() + 1) * 6
        if DEBUG: print(max)


        bm.edges.ensure_lookup_table()
        # k = np.zeros((12,12))
        K=np.zeros((max,max))
        Kcst=np.zeros((22, 22))
        for e in range(len(edge_matrix)):
            # print(e)
            # print("coormat", coormat)
            # print("1", coormat[e, 0])
            k = self.ElementStiffnessMatrix(properties, coormat[e, :], edge_matrix, e)
            # print(k)

            K = self.SpaceTrussAssemble(K, k, edge_matrix[e, :])
            # Kcst = self.cstassembletest(Kcst, k, edge_matrix[e,:])
            print("e", e)
            K2 = self.test()
            # for i in range(18):
            #     print("K:", K[i,:])
            #     print("K2:", K2[i,:])

            # for i in range(12):
            #     for j in range(12):
            #         pass
            # print("k", k)

        # print("global:")
        # print(K)

        # print(edge_matrix)

        # # create stiffness matrix
        # k = np.zeros((len(edge_matrix),12,12))
        # bm.edges.ensure_lookup_table()
        # for i in range(len(edge_matrix)):
        #     k[i, :, :]=self.SpaceTrussElementStiffness(properties[i, 0],properties[i, 1],properties[i, 2],properties[i, 3],properties[i, 4],properties[i, 5], bm.edges[i].verts[0].co, bm.edges[i].verts[1].co)
        # if DEBUG: print(k.shape)

        # # create global stiffness matrix
        # if TIME: assem_start = timer()
        # K=np.zeros((max,max))
        # for i in range(len(edge_matrix)):        
        #     K=self.SpaceTrussAssemble(K,k[i, :, :],edge_matrix[i,1],edge_matrix[i,2])
        # if TIME: assem_end = timer()
        # print("space truss assemble", assem_end - assem_start)
        # # print("shape:", K.shape)
        # # print("K", K)

        bool = ((self.get_value(input_socket_7)))
        bool = np.invert(bool)
        
        F = self.get_value(input_socket_8)
        # print("Force:", F)
        bool = np.ravel(bool)
        # print("bool after", bool)
        # print(bool.shape)
        boolv,boolh = np.ix_(bool, bool)
        # print(boolv)

        # print(K.shape)

        # apply boundary conditions
        Ksolve = K[boolv,boolh]
        F = F[boolv]
        # print(Ksolve)
        # print(F)
        if DEBUG: print(F.shape)
        F= np.reshape(F, (-1,1))
        # F=F[1:6,:]
        print('applying boundary conditions')
        if DEBUG2: print(Ksolve)
        # print(F)

        # solve for displacement
        Ksolve_csr = sparse.csr_matrix(Ksolve)
        F_csr = sparse.csr_matrix(F)
        print('solving')
        u = scipy.sparse.linalg.spsolve(Ksolve_csr, F_csr)
        print('solving done')
        if DEBUG: print(u)
        bound=np.array([0])
        U=np.zeros((max,1))
        j = 0
        for i in range(len(U)):
            if bool[i] == 1:
                U[i] = u[j]
                j = j + 1

        # print("U")
        # print(U)

        if DEBUG: print(U)
        if TIME: end = timer()
        print("time:", end - start)
        
        
        array = U.tolist()
        self.disp = json.dumps(array)

        return U
                
        # # caclulate force
        # F = K.dot(U)
        # if DEBUG: print(F)

        # sigma = np.zeros([len(edge_matrix)])
        # # calculate stress
        # for i in range(len(edge_matrix)):
        #     store = properties[i,1] / properties[i,0] * np.array([-properties[i,3], -properties[i,4], -properties[i,5], properties[i,3], properties[i,4], properties[i,5]])   
        #     uvect = np.array([U[3 * edge_matrix[i,1]], U[3 * edge_matrix[i,1] + 1], U[3 * edge_matrix[i,1] + 2], U[3 * edge_matrix[i,2]], U[3 * edge_matrix[i,2] + 1], U[3 * edge_matrix[i,2] + 2]])
        #     uvect = uvect.reshape(-1,1)
        #     sigma[i] = store.dot(uvect)
        #     if DEBUG: print(uvect.shape)

        # if DEBUG: print(sigma)

        # # output colors
        #     # currently using displacement as output because this can be done on verticies
        #     # stress output need to find a way to output color to edges
        # # vcol_output = bm.vertex_colors.new()

        # # for v in bm.vertex:
            

        # # apply changes
        # bm.to_mesh(ob)
        # bm.free() # free and prevent further acess



        
        # return object
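# A minimal sketch of the boundary-condition / solve pattern used in eval()
# above, on a hypothetical 4-DOF system (the stiffness values are illustrative):
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import spsolve

K = np.array([[ 2., -1.,  0.,  0.],
              [-1.,  2., -1.,  0.],
              [ 0., -1.,  2., -1.],
              [ 0.,  0., -1.,  1.]])
F = np.array([0., 0., 0., 1.])

free = np.array([False, True, True, True])   # DOF 0 is constrained
rows, cols = np.ix_(free, free)              # select the free sub-system

u_free = spsolve(sparse.csr_matrix(K[rows, cols]), F[free])

U = np.zeros(len(F))                         # scatter back into the full vector
U[free] = u_free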
Example #47
0
def load_youtube_data(prefix, ptrain):
    npz_file = 'data/{}_{}.npz'.format(prefix, ptrain)
    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        data = np.load(npz_file)
        num_data     = data['num_data']
        labels       = data['labels']
        train_data   = data['train_data']
        val_data     = data['val_data']
        test_data    = data['test_data']
        adj = sp.csr_matrix((data['adj_data'], data['adj_indices'], data['adj_indptr']), 
                            shape=data['adj_shape'])
        feats = sp.csr_matrix((data['feats_data'], data['feats_indices'], data['feats_indptr']), 
                            shape=data['feats_shape'])
        feats1 = sp.csr_matrix((data['feats1_data'], data['feats1_indices'], data['feats1_indptr']), 
                            shape=data['feats1_shape'])
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        start_time = time()
        # read edges
        with open('data/'+prefix+'/edges.csv') as f:
            links = [link.split(',') for link in f.readlines()]
            links = [(int(link[0])-1, int(link[1])-1) for link in links]
        links = np.array(links).astype(np.int32)
        num_data = np.max(links)+1
        adj = sp.csr_matrix((np.ones(links.shape[0], dtype=np.float32), 
                             (links[:,0], links[:,1])),
                             shape=(num_data, num_data))
        adj = adj + adj.transpose()

        def _normalize_adj(adj):
            rowsum = np.array(adj.sum(1)).flatten()
            d_inv  = 1.0 / (rowsum+1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj)
            return adj

        adj = _normalize_adj(adj)

        feats = sp.eye(num_data, dtype=np.float32).tocsr()
        feats1 = adj.dot(feats)
        num_classes = 47

        labels = np.zeros((num_data, num_classes), dtype=np.float32)
        with open('data/'+prefix+'/group-edges.csv') as f:
            for line in f.readlines():
                line = line.split(',')
                labels[int(line[0])-1, int(line[1])-1] = 1

        data = np.nonzero(labels.sum(1))[0].astype(np.int32)

        np.random.shuffle(data)
        n_train = int(len(data)*ptrain)
        train_data = np.copy(data[:n_train])
        val_data   = np.copy(data[n_train:])
        test_data  = np.copy(data[n_train:])

        num_data, adj, feats, feats1, labels, train_data, val_data, test_data = \
                data_augmentation(num_data, adj, adj, feats, labels, 
                                  train_data, val_data, test_data)

        print("Done. {} seconds.".format(time()-start_time))
        with open(npz_file, 'wb') as fwrite:
            np.savez(fwrite, num_data=num_data, 
                             adj_data=adj.data, adj_indices=adj.indices,
                             adj_indptr=adj.indptr, adj_shape=adj.shape,
                             feats_data=feats.data, feats_indices=feats.indices,
                             feats_indptr=feats.indptr, feats_shape=feats.shape,
                             feats1_data=feats1.data, feats1_indices=feats1.indices,
                             feats1_indptr=feats1.indptr, feats1_shape=feats1.shape,
                             labels=labels,
                             train_data=train_data, val_data=val_data, 
                             test_data=test_data)

    return num_data, adj, feats, feats1, labels, train_data, val_data, test_data
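# A small sketch of what _normalize_adj above computes: the row-normalized
# adjacency D^-1 * A, so each non-empty row of the result sums to (roughly) 1.
import numpy as np
import scipy.sparse as sp

A = sp.csr_matrix(np.array([[0., 1., 1.],
                            [1., 0., 0.],
                            [1., 0., 0.]]))
rowsum = np.array(A.sum(1)).flatten()
d_inv = 1.0 / (rowsum + 1e-20)
A_norm = sp.diags(d_inv, 0).dot(A)
print(A_norm.toarray())   # rows [0, .5, .5], [1, 0, 0], [1, 0, 0]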
Example #48
0
def partition(A, p, alg, *args):
    A = csr_matrix(A)
    return alg(A.indptr, A.indices, A.data.astype('uint32'), A.get_shape()[1], p, *args)
Example #49
0
 def _get_adj(data, coords):
     adj = sp.csr_matrix((data, (coords[0,:], coords[1,:])),
                         shape=(num_data, num_data))
     return adj
# sparse matrix
from numpy import array
from scipy.sparse import csr_matrix
# create dense matrix
A = array([
	[1, 0, 0, 1, 0, 0],
	[0, 0, 2, 0, 0, 1],
	[0, 0, 0, 2, 0, 0]])
print(A)
# convert to sparse matrix (CSR method)
S = csr_matrix(A)
print(S)
# reconstruct dense matrix
B = S.todense()
print(B)
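# Continuing the snippet above: the CSR format stores only the non-zeros of A
# in three arrays (a quick inspection sketch of S):
print(S.data)     # [1 1 2 1 2]   non-zero values, row by row
print(S.indices)  # [0 3 2 5 3]   column index of each stored value
print(S.indptr)   # [0 2 4 5]     row i spans data[indptr[i]:indptr[i+1]]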
    def similarityMatrixTopK(self,
                             item_weights,
                             force_sparse_output=True,
                             k=100,
                             verbose=False,
                             inplace=True):
        """
        The function selects the TopK most similar elements, column-wise

        :param item_weights:
        :param force_sparse_output:
        :param k:
        :param verbose:
        :param inplace: Default True, WARNING matrix will be modified
        :return:
        """

        assert (item_weights.shape[0] == item_weights.shape[1]
                ), "selectTopK: ItemWeights is not a square matrix"

        start_time = time.time()

        if verbose:
            print("Generating topK matrix")

        nitems = item_weights.shape[1]
        k = min(k, nitems)

        # for each column, keep only the top-k scored items
        sparse_weights = not isinstance(item_weights, np.ndarray)

        if not sparse_weights:

            print("Sorting columns...")
            idx_sorted = np.argsort(item_weights,
                                    axis=0)  # sort data inside each column
            print("Done!")

            if inplace:
                W = item_weights
            else:
                W = item_weights.copy()

            # index of the items that don't belong to the top-k similar items of each column
            not_top_k = idx_sorted[:-k, :]
            # use numpy fancy indexing to zero-out the values in sim without using a for loop
            W[not_top_k, np.arange(nitems)] = 0.0

            if force_sparse_output:
                if verbose:
                    print("Starting CSR compression...")

                W_sparse = sps.csr_matrix(W, shape=(nitems, nitems))

                if verbose:
                    print("Sparse TopK matrix generated in {:.2f} seconds".
                          format(time.time() - start_time))

                return W_sparse

            if verbose:
                print("Dense TopK matrix generated in {:.2f} seconds".format(
                    time.time() - start_time))

            return W

        else:
            # iterate over each column and keep only the top-k similar items
            data, rows_indices, cols_indptr = [], [], []

            item_weights = check_matrix(item_weights,
                                        format='csc',
                                        dtype=np.float32)

            for item_idx in range(nitems):
                cols_indptr.append(len(data))

                start_position = item_weights.indptr[item_idx]
                end_position = item_weights.indptr[item_idx + 1]

                column_data = item_weights.data[start_position:end_position]
                column_row_index = item_weights.indices[
                    start_position:end_position]

                non_zero_data = column_data != 0

                idx_sorted = np.argsort(
                    column_data[non_zero_data])  # sort by column
                top_k_idx = idx_sorted[-k:]

                data.extend(column_data[non_zero_data][top_k_idx])
                rows_indices.extend(column_row_index[non_zero_data][top_k_idx])

            cols_indptr.append(len(data))

            # During testing CSR is faster

            if verbose:
                print("Generating CSC matrix...")

            W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr),
                                      shape=(nitems, nitems),
                                      dtype=np.float32)

            if verbose:
                print("Converting to CSR...")

            W_sparse = W_sparse.tocsr()

            if verbose:
                print("Sparse TopK matrix generated in {:.2f} seconds".format(
                    time.time() - start_time))

            return W_sparse
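# A standalone sketch of the dense column-wise top-K idea used above: keep only
# the k largest entries of each column (the matrix values are illustrative).
import numpy as np

W = np.array([[1.0, 0.2, 0.9],
              [0.2, 1.0, 0.4],
              [0.9, 0.4, 1.0]])
k = 2
idx_sorted = np.argsort(W, axis=0)        # ascending sort within each column
not_top_k = idx_sorted[:-k, :]            # everything except the k largest
W[not_top_k, np.arange(W.shape[1])] = 0.0
print(W)                                  # each column keeps its 2 largest values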
Example #52
0
def load_gcn_data(dataset_str):
    npz_file = 'data/{}_{}.npz'.format(dataset_str, FLAGS.normalization)
    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        data = np.load(npz_file)
        num_data     = data['num_data']
        labels       = data['labels']
        train_data   = data['train_data']
        val_data     = data['val_data']
        test_data    = data['test_data']
        train_adj = sp.csr_matrix((data['train_adj_data'], data['train_adj_indices'], data['train_adj_indptr']), shape=data['train_adj_shape'])
        full_adj = sp.csr_matrix((data['full_adj_data'], data['full_adj_indices'], data['full_adj_indptr']), shape=data['full_adj_shape'])
        feats = sp.csr_matrix((data['feats_data'], data['feats_indices'], data['feats_indptr']), shape=data['feats_shape'])
        train_feats = sp.csr_matrix((data['train_feats_data'], data['train_feats_indices'], data['train_feats_indptr']), shape=data['train_feats_shape'])
        test_feats = sp.csr_matrix((data['test_feats_data'], data['test_feats_indices'], data['test_feats_indptr']), shape=data['test_feats_shape'])
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        """Load data."""
        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
                if sys.version_info > (3, 0):
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))

        x, y, tx, ty, allx, ally, graph = tuple(objects)

        if dataset_str != 'nell':
            test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
            test_idx_range = np.sort(test_idx_reorder)

            if dataset_str == 'citeseer':
                # Fix citeseer dataset (there are some isolated nodes in the graph)
                # Find isolated nodes, add them as zero-vecs into the right position
                test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
                tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
                tx_extended[test_idx_range-min(test_idx_range), :] = tx
                tx = tx_extended
                ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
                ty_extended[test_idx_range-min(test_idx_range), :] = ty
                ty = ty_extended

            features = sp.vstack((allx, tx)).tolil()
            features[test_idx_reorder, :] = features[test_idx_range, :]
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

            labels = np.vstack((ally, ty))
            labels[test_idx_reorder, :] = labels[test_idx_range, :]

            idx_test = test_idx_range.tolist()
            # idx_train = range(len(y)) 
            idx_train = range(18217) 
            idx_val = range(len(y), len(y)+500)

            train_mask = sample_mask(idx_train, labels.shape[0])
            val_mask = sample_mask(idx_val, labels.shape[0])
            test_mask = sample_mask(idx_test, labels.shape[0])

            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]
        else:
            test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
            features = allx.tocsr()
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
            labels = ally
            idx_test = test_idx_reorder
            idx_train = range(len(y))
            idx_val = range(len(y), len(y)+969)
            train_mask = sample_mask(idx_train, labels.shape[0])
            val_mask = sample_mask(idx_val, labels.shape[0])
            test_mask = sample_mask(idx_test, labels.shape[0])
            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]

        # num_data, (v, coords), feats, labels, train_d, val_d, test_d
        num_data = features.shape[0]
        def _normalize_adj(adj):
            rowsum = np.array(adj.sum(1)).flatten()
            d_inv  = 1.0 / (rowsum+1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj).tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data.astype(np.float32), coords

        def gcn_normalize_adj(adj):
            adj = adj + sp.eye(adj.shape[0])
            rowsum = np.array(adj.sum(1)) + 1e-20
            d_inv_sqrt = np.power(rowsum, -0.5).flatten()
            d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
            d_mat_inv_sqrt = sp.diags(d_inv_sqrt, 0)
            adj = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)
            adj = adj.tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data.astype(np.float32), coords

        # Normalize features
        rowsum = np.array(features.sum(1)) + 1e-9
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv, 0)
        features = r_mat_inv.dot(features)

        if FLAGS.normalization == 'gcn':
            full_v, full_coords = gcn_normalize_adj(adj)
        else:
            full_v, full_coords = _normalize_adj(adj)
        full_v = full_v.astype(np.float32)
        full_coords = full_coords.astype(np.int32)
        train_v, train_coords = full_v, full_coords
        labels = (y_train + y_val + y_test).astype(np.float32)
        train_data = np.nonzero(train_mask)[0].astype(np.int32)
        val_data   = np.nonzero(val_mask)[0].astype(np.int32)
        test_data  = np.nonzero(test_mask)[0].astype(np.int32)

        feats = (features.data, features.indices, features.indptr, features.shape)

        def _get_adj(data, coords):
            adj = sp.csr_matrix((data, (coords[0,:], coords[1,:])), 
                                shape=(num_data, num_data))
            return adj

        train_adj = _get_adj(train_v, train_coords)
        full_adj  = _get_adj(full_v,  full_coords)
        feats = sp.csr_matrix((feats[0], feats[1], feats[2]), 
                              shape=feats[-1], dtype=np.float32)

        train_feats = train_adj.dot(feats)
        test_feats  = full_adj.dot(feats)

        with open(npz_file, 'wb') as fwrite:
            np.savez(fwrite, num_data=num_data, 
                             train_adj_data=train_adj.data, train_adj_indices=train_adj.indices, train_adj_indptr=train_adj.indptr, train_adj_shape=train_adj.shape,
                             full_adj_data=full_adj.data, full_adj_indices=full_adj.indices, full_adj_indptr=full_adj.indptr, full_adj_shape=full_adj.shape,
                             feats_data=feats.data, feats_indices=feats.indices, feats_indptr=feats.indptr, feats_shape=feats.shape,
                             train_feats_data=train_feats.data, train_feats_indices=train_feats.indices, train_feats_indptr=train_feats.indptr, train_feats_shape=train_feats.shape,
                             test_feats_data=test_feats.data, test_feats_indices=test_feats.indices, test_feats_indptr=test_feats.indptr, test_feats_shape=test_feats.shape,
                             labels=labels,
                             train_data=train_data, val_data=val_data, 
                             test_data=test_data)

    return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data
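# A small sketch of the symmetric normalization computed by gcn_normalize_adj
# above, D^-1/2 * (A + I) * D^-1/2, on a toy two-node adjacency matrix:
import numpy as np
import scipy.sparse as sp

A = sp.csr_matrix(np.array([[0., 1.], [1., 0.]]))
A_hat = A + sp.eye(A.shape[0])
rowsum = np.array(A_hat.sum(1)).flatten()
d_inv_sqrt = np.power(rowsum, -0.5)
D_inv_sqrt = sp.diags(d_inv_sqrt, 0)
A_norm = A_hat.dot(D_inv_sqrt).transpose().dot(D_inv_sqrt)
print(A_norm.toarray())   # [[0.5, 0.5], [0.5, 0.5]]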
def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i

    Params:
      movies...The movies DataFrame
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int. Make sure the vocab is sorted alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    ###TODO

    #df will be used for calculating tfidf
    df = dict()

    tokens_in_movies = movies['tokens']

    def unique_tokens(tokens):
        uni_tokens = set(tokens)
        return list(uni_tokens)

    # Creating df
    for tokens in tokens_in_movies:
        tokens = unique_tokens(tokens)
        for token in tokens:
            if token in df:
                df[token] = df[token] + 1
            else:
                df[token] = 1

    vocab_tokens = [key for key in df]
    vocab_tokens = sorted(vocab_tokens)

    #Creating vocab from df
    vocab = dict()
    col = 0
    for term in vocab_tokens:
        vocab[term] = col
        col = col + 1

    def get_tf(token, tokens):
        count = 0
        for tk in tokens:
            if tk == token:
                count = count + 1
        return count

    def get_max_k(tokens):
        tokenCounts = dict()
        for token in tokens:
            if token in tokenCounts:
                tokenCounts[token] = tokenCounts[token] + 1
            else:
                tokenCounts[token] = 1
        return sorted(tokenCounts.items(), key=lambda x: -x[1])[0][1]

    csr_list = []
    N = len(tokens_in_movies)
    indptr = [0]
    indices = []
    data = []
    for tokens in tokens_in_movies:
        for token in tokens:
            if token in vocab:
                indices.append(vocab[token])
                idf = math.log10(N / df[token])
                val = float(
                    float(get_tf(token, tokens) / get_max_k(tokens)) * idf)
                data.append(val)
        indptr.append(len(indices))
        csr_list.append(
            csr_matrix((data, indices, indptr), shape=(1, len(vocab))))
        indptr = [0]
        indices = []
        data = []

    movies['features'] = pd.Series(csr_list)
    return movies, vocab
    pass
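# A tiny worked example of the tf-idf formula from the docstring above, on a
# hypothetical two-document corpus (not from the source):
import math

docs = [['horror', 'horror', 'zombie'], ['romance', 'zombie']]
N = len(docs)
df = {'horror': 1, 'zombie': 2, 'romance': 1}

# tf-idf of 'horror' in doc 0: tf = 2, max_k tf = 2, df = 1
val = 2 / 2 * math.log10(N / df['horror'])   # = log10(2) ~= 0.301
print(val)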
Example #54
0
def load_graphsage_data(prefix, normalize=True):

    '''
    version_info = map(int, nx.__version__.split('.'))
    major = version_info[0]
    minor = version_info[1]
    assert (major <= 1) and (minor <= 11), "networkx major version must be <= 1.11 in order to load graphsage data"
    '''

    # Save normalized version
    if FLAGS.max_degree==-1:
        npz_file = prefix + '.npz'
    else:
        npz_file = '{}_deg{}.npz'.format(prefix, FLAGS.max_degree)

    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        data = np.load(npz_file)
        num_data     = data['num_data']
        feats        = data['feats']
        train_feats  = data['train_feats']
        test_feats   = data['test_feats']
        labels       = data['labels']
        train_data   = data['train_data']
        val_data     = data['val_data']
        test_data    = data['test_data']
        train_adj = sp.csr_matrix((data['train_adj_data'], data['train_adj_indices'], data['train_adj_indptr']), shape=data['train_adj_shape'])
        full_adj  = sp.csr_matrix((data['full_adj_data'], data['full_adj_indices'], data['full_adj_indptr']), shape=data['full_adj_shape'])
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        print('Loading data...')
        start_time = time()
    
        G_data = json.load(open(prefix + "-G.json"))
        G = json_graph.node_link_graph(G_data)
    
        feats = np.load(prefix + "-feats.npy").astype(np.float32)
        id_map = json.load(open(prefix + "-id_map.json"))
        if list(id_map.keys())[0].isdigit():
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        id_map = {conversion(k):int(v) for k,v in id_map.items()}

        walks = []
        class_map = json.load(open(prefix + "-class_map.json"))
        if isinstance(list(class_map.values())[0], list):
            lab_conversion = lambda n : n
        else:
            lab_conversion = lambda n : int(n)
    
        class_map = {conversion(k): lab_conversion(v) for k,v in class_map.items()}

        ## Remove all nodes that do not have val/test annotations
        ## (necessary because of networkx weirdness with the Reddit data)
        broken_count = 0
        to_remove = []
        for node in G.nodes():
            if not node in id_map:
            #if not G.node[node].has_key('val') or not G.node[node].has_key('test'):
                to_remove.append(node)
                broken_count += 1
        for node in to_remove:
            G.remove_node(node)
        print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count))
    
        # Construct adjacency matrix
        print("Loaded data ({} seconds).. now preprocessing..".format(time()-start_time))
        start_time = time()
    
        edges = []
        for edge in G.edges():
            if edge[0] in id_map and edge[1] in id_map:
                edges.append((id_map[edge[0]], id_map[edge[1]]))
        print('{} edges'.format(len(edges)))
        num_data   = len(id_map)

        if FLAGS.max_degree != -1:
            print('Subsampling edges...')
            edges = subsample_edges(edges, num_data, FLAGS.max_degree)

        val_data   = np.array([id_map[n] for n in G.nodes() 
                                 if G.node[n]['val']], dtype=np.int32)
        test_data  = np.array([id_map[n] for n in G.nodes() 
                                 if G.node[n]['test']], dtype=np.int32)
        is_train   = np.ones((num_data), dtype=np.bool)
        is_train[val_data] = False
        is_train[test_data] = False
        train_data = np.array([n for n in range(num_data) if is_train[n]], dtype=np.int32)
        
        train_edges = [(e[0], e[1]) for e in edges if is_train[e[0]] and is_train[e[1]]]
        edges       = np.array(edges, dtype=np.int32)
        train_edges = np.array(train_edges, dtype=np.int32)
    
        # Process labels
        if isinstance(list(class_map.values())[0], list):
            num_classes = len(list(class_map.values())[0])
            labels = np.zeros((num_data, num_classes), dtype=np.float32)
            for k in class_map.keys():
                labels[id_map[k], :] = np.array(class_map[k])
        else:
            num_classes = len(set(class_map.values()))
            labels = np.zeros((num_data, num_classes), dtype=np.float32)
            for k in class_map.keys():
                labels[id_map[k], class_map[k]] = 1
    
        if normalize:
            from sklearn.preprocessing import StandardScaler
            train_ids = np.array([id_map[n] for n in G.nodes() 
                          if not G.node[n]['val'] and not G.node[n]['test']])
            train_feats = feats[train_ids]
            scaler = StandardScaler()
            scaler.fit(train_feats)
            feats = scaler.transform(feats)

        def _normalize_adj(edges):
            adj = sp.csr_matrix((np.ones((edges.shape[0]), dtype=np.float32),
                (edges[:,0], edges[:,1])), shape=(num_data, num_data))
            adj += adj.transpose()

            rowsum = np.array(adj.sum(1)).flatten()
            d_inv  = 1.0 / (rowsum+1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj).tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data, coords

        train_v, train_coords = _normalize_adj(train_edges)
        full_v,  full_coords  = _normalize_adj(edges)

        def _get_adj(data, coords):
            adj = sp.csr_matrix((data, (coords[0,:], coords[1,:])),
                                shape=(num_data, num_data))
            return adj
        
        train_adj = _get_adj(train_v, train_coords)
        full_adj  = _get_adj(full_v,  full_coords)
        train_feats = train_adj.dot(feats)
        test_feats  = full_adj.dot(feats)

        print("Done. {} seconds.".format(time()-start_time))
        with open(npz_file, 'wb') as fwrite:
            print('Saving {} edges'.format(full_adj.nnz))
            np.savez(fwrite, num_data=num_data, 
                             train_adj_data=train_adj.data, train_adj_indices=train_adj.indices, train_adj_indptr=train_adj.indptr, train_adj_shape=train_adj.shape,
                             full_adj_data=full_adj.data, full_adj_indices=full_adj.indices, full_adj_indptr=full_adj.indptr, full_adj_shape=full_adj.shape,
                             feats=feats, train_feats=train_feats, test_feats=test_feats,
                             labels=labels,
                             train_data=train_data, val_data=val_data, 
                             test_data=test_data)

    return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data
Example #55
0
def test_concatenate():
    # dense data
    adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c'],
                      'annoA': [0, 1, 2]})
    adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s3', 's4'],
                      'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b'],
                      'annoA': [0, 1, 2]})
    adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b'],
                      'annoB': [0, 1, 2]})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
    assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]
    adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
    assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
    adata = adata1.concatenate(adata2, adata3, batch_categories=['a1', 'a2', 'a3'])
    assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert adata.var_names.tolist() == ['b', 'c']

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    from numpy import ma
    Xma = ma.masked_invalid(adata.X)
    Xma_ref = ma.masked_invalid(np.array([
        [1.0, 2.0, 3.0, np.nan],
        [4.0, 5.0, 6.0, np.nan],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0],
        [np.nan, 3.0, 2.0, 1.0],
        [np.nan, 6.0, 5.0, 4.0]]))
    assert np.array_equal(Xma.mask, Xma_ref.mask)
    assert np.allclose(Xma.compressed(), Xma_ref.compressed())
    var_ma = ma.masked_invalid(adata.var.values.tolist())
    var_ma_ref = ma.masked_invalid(np.array(
        [[0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [np.nan, 0.0, 0.0]]))
    assert np.array_equal(var_ma.mask, var_ma_ref.mask)
    assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())

    # sparse data
    from scipy.sparse import csr_matrix
    adata1 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s1', 's2'],
                      'anno1': ['c1', 'c2']},
                     {'var_names': ['a', 'b', 'c']})
    adata2 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]),
                     {'obs_names': ['s3', 's4'],
                      'anno1': ['c3', 'c4']},
                     {'var_names': ['d', 'c', 'b']})
    adata3 = AnnData(csr_matrix([[1, 2, 0], [0, 5, 6]]),
                     {'obs_names': ['s5', 's6'],
                      'anno2': ['d3', 'd4']},
                     {'var_names': ['d', 'c', 'b']})

    # inner join
    adata = adata1.concatenate(adata2, adata3)
    assert adata.X.toarray().astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]

    # outer join
    adata = adata1.concatenate(adata2, adata3, join='outer')
    assert adata.X.toarray().tolist() == [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0]]
Example #56
0
 def adjacency_matrix(self):
     E = self.undirected_edges()
     vals = np.squeeze(np.ones((len(E), 1)))
     return sp.csr_matrix((vals, (E[:, 0], E[:, 1])), shape=(self.num_vertices, self.num_vertices))
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(X,
                                    k,
                                    power_iteration_normalizer=normalizer,
                                    random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert_equal(Ua.shape, (n_samples, k))
        assert_equal(sa.shape, (k, ))
        assert_equal(Va.shape, (k, n_features))

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]),
                            np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
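# A minimal standalone sketch of the randomized_svd call exercised above, run
# on a small random CSR matrix (the sizes are illustrative only):
import numpy as np
from scipy import sparse
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = sparse.csr_matrix(rng.rand(50, 20))

U, s, Vt = randomized_svd(X, n_components=5, random_state=0)
print(U.shape, s.shape, Vt.shape)   # (50, 5) (5,) (5, 20)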
def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
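# A matching save helper is presumably the counterpart of load_sparse_csr; a
# minimal sketch under that assumption (the function name is not from the source):
import numpy as np

def save_sparse_csr(filename, matrix):
    # store the three CSR arrays plus the shape, mirroring the loader above
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)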
Example #59
0
def form_poisson_equation_impl(height, width, alpha, normals, depth_weight,
                               depth):
    """
    For 4-credit students only
    Creates a Poisson equation given the normals and depth at every pixel in the image.
    The solution to the Poisson equation is the estimated depth.
    When the mode in 'combine.py' is 'depth', the equation should return the actual depth.
    When it is 'normals', the equation should integrate the normals to estimate depth.
    When it is 'both', the equation should weight the contributions from the normals and the actual depth,
    using the parameter 'depth_weight'.

    Input:
        height -- height of input depth,normal array
        width -- width of input depth,normal array
        alpha -- stores alpha value of at each pixel of image. 
            If alpha = 0, then the pixel normal/depth should not be 
            taken into consideration for depth estimation
        normals -- stores the normals(nx,ny,nz) at each pixel of image
            None if mode is 'depth' in combine.py
        depth_weight -- parameter to trade off between normals and depth when the estimation mode is 'both'
            A high weight on the normals means a low depth_weight.
            Giving high weight to the normals will result in a smoother surface, but the surface may be very different from
            what the input depth map shows.
        depth -- stores the depth at each pixel of image
            None if mode is 'normals' in combine.py
    Output:
        constants for equation of type Ax = b
        A -- left-hand side coefficient of the Poisson equation 
            note that A can be a very large but sparse matrix so csr_matrix is used to represent it.
        b -- right-hand side constant of the Poisson equation
    """

    assert alpha.shape == (height, width)
    assert normals is None or normals.shape == (height, width, 3)
    assert depth is None or depth.shape == (height, width)
    '''
    Since the A matrix is sparse, instead of filling the full matrix we assign values to the non-zero elements only.
    For each non-zero element in matrix A, if A[i,j] = v, there should be some index k such that
        row_ind[k] = i
        col_ind[k] = j
        data_arr[k] = v
    Fill these values accordingly
    '''
    row_ind = []
    col_ind = []
    data_arr = []
    '''
    For each row in the system of equation fill the appropriate value for vector b in that row
    '''
    b = []
    if depth_weight is None:
        depth_weight = 1
    '''
    TODO
    Create a system of linear equations Ax = b to estimate depth using the normals and the crude depth.

    x is a vector of depths at each pixel in the image and will have shape (height*width)

    If mode is 'depth':
        > Each row in A and b corresponds to an equation at a single pixel
        > For each pixel k, 
            if pixel k has alpha value zero do not add any new equation.
            else, fill row in b with depth_weight*depth[k] and fill column k of the corresponding
                row in A with depth_weight.

        Justification: 
            Since all the elements in a row except column k are zero, this reduces to
                depth_weight*x[k] = depth_weight*depth[k]
            You may see that solving this gives x values exactly equal to the depths
            at pixels where alpha is non-zero, so why do we need 'depth_weight' in A and b?
            The answer will become clear when this is reused in 'both' mode.

    Note: The normals in the image are positive when they point along the +x, +y, -z axes, as seen from the camera's viewpoint.
    If mode is 'normals':
        > Each row in A and b corresponds to an equation of relationship between adjacent pixels
        > For each pixel k and its immediate neighbour l along the x-axis,
            if either pixel k or pixel l has alpha value zero, do not add any new equation.
            else, fill row in b with nx[k] (nx is x-component of normal), fill column k of the corresponding
                row in A with -nz[k] and column k+1 with value nz[k]
        > Repeat the above along the y-axis as well, except nx[k] should be -ny[k].

        Justification: Assume the depth is smooth and almost planar within one pixel width.
        The normal projected into the xz-plane at pixel k is perpendicular to the tangent of the surface in the xz-plane.
        In other words, if n = (nx,ny,-nz), its projection in the xz-plane is (nx,nz) and if the tangent t = (tx,0,tz),
            then n.t = 0, therefore nx/-nz = -tz/tx
        Therefore the depth change over one pixel width along the x-axis should be proportional to tz/tx = -nx/nz
        In other words (depth[k+1]-depth[k])*nz[k] = nx[k]
        This is exactly what the equation above represents.
        The negative sign in ny[k] is because the indexing along the y-axis is opposite to the +y direction.

    If mode is 'both':
        > Do both of the above steps.

        Justification: The depth provides a crude estimate of the actual depth; the normals do the smoothing of the depth map.
        This is why 'depth_weight' was used above in 'depth' mode.
            If 'depth_weight' is very large, we give preference to the input depth map.
            If 'depth_weight' is close to zero, we give preference to the normals.
    '''
    #TODO Block Begin
    #fill row_ind,col_ind,data_arr,b
    raise NotImplementedError()
    #TODO Block end
    # Convert all the lists to numpy array
    row_ind = np.array(row_ind, dtype=np.int32)
    col_ind = np.array(col_ind, dtype=np.int32)
    data_arr = np.array(data_arr, dtype=np.float32)
    b = np.array(b, dtype=np.float32)

    # Create a compressed sparse matrix from indices and values
    A = csr_matrix((data_arr, (row_ind, col_ind)), shape=(len(b), width * height))  # one row per equation

    return A, b
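# A small illustration (not the assignment solution) of the 'depth'-mode rows
# described in the docstring above: one equation depth_weight * x[k] =
# depth_weight * depth[k] per pixel k with non-zero alpha. All values are toy data.
import numpy as np
from scipy.sparse import csr_matrix

height, width, depth_weight = 2, 2, 0.5
alpha = np.array([[1, 0], [1, 1]])
depth = np.array([[2.0, 0.0], [3.0, 4.0]])

row_ind, col_ind, data_arr, b = [], [], [], []
for k in range(height * width):
    if alpha.flat[k] == 0:
        continue
    row_ind.append(len(b))          # one new equation for this pixel
    col_ind.append(k)               # touching only the unknown x[k]
    data_arr.append(depth_weight)
    b.append(depth_weight * depth.flat[k])

A = csr_matrix((data_arr, (row_ind, col_ind)), shape=(len(b), height * width))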
Example #60
0
def bow(category, hNeg=True, noun=False):
           
    category += '/'

    # Load the positive and negative reviews
    positive = open('sorted_data_acl/' + category  + 'positive.review', 'r')
    negative = open('sorted_data_acl/' + category  + 'negative.review', 'r')

    # Use BeautifulSoup to read the xml
    positive_reviews = (BeautifulSoup(positive, 'lxml'))
    negative_reviews = (BeautifulSoup(negative, 'lxml'))

    # Keep only the reviews
    positive_reviews = positive_reviews.find_all(['review'])
    negative_reviews = negative_reviews.find_all(['review'])

    n_pos_reviews = len(positive_reviews)
    n_neg_reviews = len(negative_reviews)

    # Initialize the bag of words, the vocabulary and the bigrams
    bags = []
    vocabulary = []    
    bigrams = []

    # Process the positive reviews
    for review in positive_reviews:

        # Keep only the title and the text of each review
        review_text = (review.find('title').string + review.find('review_text').string).lower()

        # Tokenize
        review_text = nltk.word_tokenize(review_text)

        # Call the function that cleans the data
        review_text = clear_text(review_text, noun)
        
        # Store the words in the vocabulary
        vocabulary.extend(review_text)
    
        bag = {}
        
        # Count the occurrences of each word
        for word in review_text:
            if word in bag:
                bag[word] += 1
            else:
                bag[word] = 1
        
        # Store the occurrences of each word
        bags.append(bag)
                 
          
    for review in negative_reviews:

        # Keep only the title and the text of each review
        review_text = (review.find('title').string + review.find('review_text').string).lower()

        # Tokenize
        review_text = nltk.word_tokenize(review_text)

        # Call the function that cleans the data
        review_text = clear_text(review_text, noun)
        
        # Call the function that merges negations into bigrams
        if hNeg:
            review_text = handle_negation(review_text)
             
        # Store the words in the vocabulary
        vocabulary.extend(review_text)
        
        bag = {}

        # Count the occurrences of each word
        for word in review_text:
            if word in bag:
                bag[word] += 1
            else:
                bag[word] = 1
        
        # Store the occurrences of each word
        bags.append(bag)
        
    n_reviews = n_pos_reviews + n_neg_reviews
    
    # sort and get unique words
    vocabulary = list(set(vocabulary))
    
    # generates matrix where m[i][j] is the number of times the word j appears in document i
    matrix = np.zeros((n_reviews, len(vocabulary)), dtype="int")
      
    # Organize the bags of words into features and store them in the matrix under the corresponding columns
    for i in range(n_reviews):
        for key in bags[i]:
            index = vocabulary.index(key)
            matrix[i][index] = bags[i][key]
    
    # make target array
    target = np.zeros((n_pos_reviews + n_neg_reviews), dtype="int")
    target[:n_pos_reviews] = 1
     
    # transform the matrix into a sparse matrix
    sMatrix = csr_matrix(matrix) 
        
    return sMatrix, target, vocabulary
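# A minimal usage sketch for bow() above, assuming the sorted_data_acl dataset
# and the helper functions (clear_text, handle_negation) are available; 'books'
# is used here as a hypothetical category name, and MultinomialNB is just one
# possible downstream classifier.
from sklearn.naive_bayes import MultinomialNB

X, y, vocab = bow('books')
clf = MultinomialNB().fit(X, y)
print(clf.score(X, y))   # training accuracy only, as a smoke test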