def getMetaFeatures(questionPosts, answerPosts):
    """Build per-answer feature matrices from question and answer posts.

    Every question body is stripped of markup, parsed once with ``nlp`` and
    its features cached under the post ``Id``.  Each answer is parsed the same
    way and paired with the cached features of the question named by its
    ``ParentId``.

    Parameters
    ----------
    questionPosts : iterable of mapping
        Question posts; each must provide ``'Id'`` and ``'Body'``.
    answerPosts : iterable of mapping
        Answer posts; each must provide ``'ParentId'`` and ``'Body'``.

    Returns
    -------
    tuple of scipy.sparse.csr_matrix
        ``(qMeta, aMeta, qaSim, qVectors, aVectors)``, one row per answer:
        question meta features (log sizes, log question-word count and one
        flag per entry of ``questionWords``), answer log sizes, the
        question/answer similarity, the question vector and the answer vector.

    Raises
    ------
    KeyError
        If an answer's ``ParentId`` matches no question ``Id``.
    """

    def log_sizes(doc):
        # log(1 + count) of characters, tokens and sentences of a parsed text.
        return [
            math.log(len(doc.text) + 1),
            math.log(len(list(doc)) + 1),
            math.log(len(list(doc.sents)) + 1),
        ]

    dataByQuestion = {}
    for qPost in questionPosts:
        doc = nlp(BeautifulSoup(qPost['Body']).get_text())
        lowered = doc.text.lower()
        qWordSum = math.log(
            sum(1 for tok in doc if tok.lemma_ in questionWords) + 1)
        qWordData = [qWord in lowered for qWord in questionWords]
        # Layout: numeric features, one flag per question word, parsed doc last.
        dataByQuestion[qPost['Id']] = (
            log_sizes(doc) + [qWordSum] + qWordData + [doc])

    qMeta, aMeta, qaSim, qVectors, aVectors = [], [], [], [], []
    for aPost in answerPosts:
        doc = nlp(BeautifulSoup(aPost['Body']).get_text())
        qData = dataByQuestion[aPost['ParentId']]
        # Slice relative to the end so the split stays correct for any number
        # of question words (the old fixed index 11 assumed exactly seven).
        qFeatures, qDoc = qData[:-1], qData[-1]
        qMeta.append(qFeatures)
        aMeta.append(log_sizes(doc))
        qaSim.append([qDoc.similarity(doc)])
        qVectors.append(list(qDoc.vector))
        aVectors.append(list(doc.vector))

    return (sparse.csr_matrix(qMeta), sparse.csr_matrix(aMeta),
            sparse.csr_matrix(qaSim), sparse.csr_matrix(qVectors),
            sparse.csr_matrix(aVectors))
def test_cs_graph_components():
    """Check the labels returned by the deprecated ``cs_graph_components``.

    The deprecation warning is silenced for the duration of the test only.
    """
    # ``np.bool`` was just an alias of the builtin ``bool`` (deprecated in
    # NumPy 1.20, removed in 1.24); the builtin works on every version.
    D = np.eye(4, dtype=bool)

    # catch_warnings restores the warning filters even if an assertion fails.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore",
                                message="`cs_graph_components` is deprecated")

        # Identity matrix: four isolated nodes.
        n_comp, flag = csgraph.cs_graph_components(csr_matrix(D))
        assert_(n_comp == 4)
        assert_equal(flag, [0, 1, 2, 3])

        # Connect nodes 0 and 1.
        D[0, 1] = D[1, 0] = 1
        n_comp, flag = csgraph.cs_graph_components(csr_matrix(D))
        assert_(n_comp == 3)
        assert_equal(flag, [0, 0, 1, 2])

        # A pathological case: node 2 loses its diagonal entry.
        D[2, 2] = 0
        n_comp, flag = csgraph.cs_graph_components(csr_matrix(D))
        assert_(n_comp == 2)
        assert_equal(flag, [0, 0, -2, 1])
def test_mkl_spsolve6():
    """
    MKL splu : Repeated RHS solve (Complex)
    """
    # Left-hand side: complex 3x3 system in sparse and dense form.
    lhs_sparse = sp.csr_matrix(
        (np.array([1, 2, 3, -4, 5, 6], dtype=complex),
         (np.array([0, 0, 1, 2, 2, 2]), np.array([0, 2, 2, 0, 1, 2]))),
        shape=(3, 3), dtype=complex)
    lhs_dense = lhs_sparse.toarray()

    # Right-hand sides; duplicate coordinates are summed by the constructor.
    rhs_sparse = sp.csr_matrix(
        (np.array([1, 1, 1, 1, 1, 1], dtype=complex),
         (np.array([0, 0, 1, 1, 0, 0]), np.array([0, 2, 1, 1, 0, 0]))),
        shape=(3, 3), dtype=complex)
    rhs_dense = rhs_sparse.toarray()

    # Factorise once, then solve column by column with the same factor.
    solved = np.zeros((3, 3), dtype=complex)
    factor = mkl_splu(lhs_sparse)
    for col in range(3):
        solved[:, col] = factor.solve(rhs_dense[:, col])
    factor.delete()

    reference = la.solve(lhs_dense, rhs_dense)
    assert_array_almost_equal(reference, solved)
def setUp(self):
    """Create the fixture: two sparse term vectors, a calculator, a term pair."""
    animal_vector = csr_matrix([0.1, 0.0, 0.3, 0.4])
    cat_vector = csr_matrix([0.0, 0.5, 0.0, 1.0])
    self.term_map = dict(animal=animal_vector, cat=cat_vector)
    self.sim = SimCalculator()
    self.pair = ('cat', 'animal')
def _transform_sparse(self, X): indices = X.indices.copy() indptr = X.indptr.copy() data_step = np.sqrt(X.data * self.sample_interval_) X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False) X_new = [X_step] log_step_nz = self.sample_interval_ * np.log(X.data) step_nz = 2 * X.data * self.sample_interval_ for j in range(1, self.sample_steps): factor_nz = np.sqrt(step_nz / np.cosh(np.pi * j * self.sample_interval_)) data_step = factor_nz * np.cos(j * log_step_nz) X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False) X_new.append(X_step) data_step = factor_nz * np.sin(j * log_step_nz) X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False) X_new.append(X_step) return sp.hstack(X_new)
def die(first_noun, second_noun, trans_verb):
    """Vectorize a sentence with 'noun die noun verb' = (sub, obj).

    Returns a pair of flat arrays: the subject reading and the object reading.
    The two differ only in whether the verb vector is transposed before the
    first Kronecker product.
    """
    noun_model = space.words.polyglot_model()
    embeddings = noun_model[0]
    die_sparse = csr_matrix(compose.train.die_cat_stored())
    verb_sparse = csr_matrix(compose.train.verb(trans_verb, noun_model))
    first_column = numpy.transpose(csr_matrix(embeddings[first_noun]))
    second_sparse = csr_matrix(embeddings[second_noun])

    # Inner product: second noun combined with the verb (plain / transposed).
    inner_sub = kron(second_sparse, verb_sparse)
    inner_obj = kron(second_sparse, numpy.transpose(verb_sparse))

    # Outer product: transposed first noun combined with the inner result.
    outer_sub = kron(first_column, csr_matrix(inner_sub))
    outer_obj = kron(first_column, csr_matrix(inner_obj))

    vector_sub = numpy.multiply(die_sparse, outer_sub)
    vector_obj = numpy.multiply(die_sparse, outer_obj)
    return (vector_sub.toarray().flatten(), vector_obj.toarray().flatten())
def test_sparse_concat(self):
    """Concatenating sparse inputs must match concatenating their dense form."""

    def build_sparse():
        # Same 4x5 matrix for both operands (one explicit zero is stored).
        values = np.array([0, 7, 2, 3], dtype=np.float32)
        rows = np.array([0, 2, 2, 3], dtype=np.int64)
        cols = np.array([4, 3, 2, 3], dtype=np.int64)
        return sparse.csr_matrix((values, (rows, cols)), shape=(4, 5))

    x_sparse_1 = build_sparse()
    x_sparse_2 = build_sparse()
    x_dense_1 = x_sparse_1.toarray()
    x_dense_2 = x_sparse_2.toarray()

    backends = [KTF]
    if KTH.th_sparse_module:
        # Theano has some dependency issues for sparse
        backends.append(KTH)

    for K in backends:
        sparse_vars = [K.variable(x_sparse_1), K.variable(x_sparse_2)]
        k_s = K.concatenate(sparse_vars)
        assert K.is_sparse(k_s)

        k_s_d = K.eval(k_s)
        dense_vars = [K.variable(x_dense_1), K.variable(x_dense_2)]
        k_d = K.eval(K.concatenate(dense_vars))

        assert k_s_d.shape == k_d.shape
        assert_allclose(k_s_d, k_d, atol=1e-05)
def matrix_completion_task(self):
    """Split ``self._X`` into two sparse matrices according to the mask.

    Entries where ``self.descr["mask"]`` equals 0 go to ``X`` and entries
    where it equals 1 go to ``Y``; any other mask value (e.g. 2) lands in
    neither matrix.

    Returns
    -------
    tuple of scipy.sparse.csr_matrix
        ``(X, Y)``.

    Raises
    ------
    AssertionError
        If a selected entry of ``self._X`` is zero, so that the stored
        non-zero count does not match the number of selected positions.
    """
    mask = self.descr["mask"]
    in_x = mask == 0
    in_y = mask == 1

    X = sparse.csr_matrix(self._X * in_x)
    Y = sparse.csr_matrix(self._X * in_y)

    assert X.nnz == in_x.sum()
    assert Y.nnz == in_y.sum()
    return X, Y
def geometry(Nr, Nz, parms):
    """Build the r/z grids and finite-difference matrices on them.

    Parameters
    ----------
    Nr : int
        The r grid has ``Nr + 1`` points spanning ``[-parms.Lr, parms.Lr]``.
    Nz : int
        The z grid has ``Nz`` points spanning ``[-parms.Lz, 0]``.
    parms : object
        Must expose the attributes ``Lr`` and ``Lz``.

    Returns
    -------
    list
        ``[Dr, Dr2, r, Dz, Dz2, z]``: first and second difference matrices in
        CSR form plus the grids, both grids stored in descending order.
    """
    # ---- r direction -------------------------------------------------------
    r = np.linspace(-parms.Lr, parms.Lr, Nr + 1)
    hr = r[1] - r[0]          # spacing is taken before the grid is reversed
    r = r[::-1]

    band = np.ones(Nr)
    below, above = np.diag(band, -1), np.diag(band, 1)

    Dr = (below - above) / (2 * hr)
    one_sided = np.array([1, -1]) / hr
    Dr[0, 0:2] = one_sided                 # two-point stencil at both ends
    Dr[Nr, Nr - 1:Nr + 1] = one_sided

    Dr2 = (below - 2 * np.diag(np.ones(Nr + 1), 0) + above) / hr ** 2
    second_r = np.array([1, -2, 1]) / hr ** 2
    Dr2[0, 0:3] = second_r
    Dr2[Nr, Nr - 2:Nr + 1] = second_r

    # ---- z direction -------------------------------------------------------
    z = np.linspace(-parms.Lz, 0, Nz)
    hz = z[1] - z[0]
    z = z[::-1]

    band = np.ones(Nz - 1)
    below, above = np.diag(band, -1), np.diag(band, 1)

    Dz = (below - above) / (2 * hz)
    Dz[0, 0:3] = np.array([-3, 4, -1]) / (2 * hz)      # three-point end rows
    Dz[Nz - 1, Nz - 3:Nz] = np.array([1, -4, 3]) / (2 * hz)

    Dz2 = (below - 2 * np.diag(np.ones(Nz), 0) + above) / hz ** 2
    second_z = np.array([1, -2, 1]) / hz ** 2
    Dz2[0, 0:3] = second_z
    Dz2[Nz - 1, Nz - 3:Nz] = second_z

    return [sp.csr_matrix(Dr), sp.csr_matrix(Dr2), r,
            sp.csr_matrix(Dz), sp.csr_matrix(Dz2), z]
def test_ddp_sorting():
    """State-action pairs given out of order must come back sorted."""
    beta = 0.95

    # Reference (already sorted) formulation.
    s_indices = [0, 0, 1]
    a_indices = [0, 1, 0]
    a_indptr = [0, 2, 3]
    R = [0, 1, 2]
    Q = [(1, 0), (1/2, 1/2), (0, 1)]

    # The same problem with the last two state-action pairs swapped.
    s_indices_shuffled = [0, 1, 0]
    a_indices_shuffled = [0, 0, 1]
    R_shuffled = [0, 2, 1]
    Q_shuffled = [(1, 0), (0, 1), (1/2, 1/2)]

    Q_sparse = sparse.csr_matrix(Q)
    Q_shuffled_sparse = sparse.csr_matrix(Q_shuffled)

    instances = [
        DiscreteDP(R, Q, beta, s_indices, a_indices),
        DiscreteDP(R, Q_sparse, beta, s_indices, a_indices),
        DiscreteDP(R_shuffled, Q_shuffled, beta,
                   s_indices_shuffled, a_indices_shuffled),
        DiscreteDP(R_shuffled, Q_shuffled_sparse, beta,
                   s_indices_shuffled, a_indices_shuffled),
    ]

    for ddp in instances:
        assert_array_equal(ddp.s_indices, s_indices)
        assert_array_equal(ddp.a_indices, a_indices)
        assert_array_equal(ddp.a_indptr, a_indptr)
        assert_array_equal(ddp.R, R)
        # Compare Q densely whichever storage the instance kept.
        ddp_Q = ddp.Q.toarray() if sparse.issparse(ddp.Q) else ddp.Q
        assert_array_equal(ddp_Q, Q)
def test_pairwise_kernels(metric):
    """pairwise_kernels must agree with the kernel function it dispatches to."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((2, 4))
    kernel = PAIRWISE_KERNEL_FUNCTIONS[metric]

    # Y omitted.
    via_helper = pairwise_kernels(X, metric=metric)
    direct = kernel(X)
    assert_array_almost_equal(via_helper, direct)

    # Explicit Y.
    via_helper = pairwise_kernels(X, Y=Y, metric=metric)
    direct = kernel(X, Y=Y)
    assert_array_almost_equal(via_helper, direct)

    # Nested tuples instead of arrays.
    X_tuples = tuple(tuple(value for value in row) for row in X)
    Y_tuples = tuple(tuple(value for value in row) for row in Y)
    from_tuples = pairwise_kernels(X_tuples, Y_tuples, metric=metric)
    assert_array_almost_equal(via_helper, from_tuples)

    # Sparse inputs.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    if metric in ["chi2", "additive_chi2"]:
        # these don't support sparse matrices yet
        assert_raises(ValueError, pairwise_kernels,
                      X_sparse, Y=Y_sparse, metric=metric)
        return
    from_sparse = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric)
    assert_array_almost_equal(from_sparse, from_tuples)
def _steadystate_direct_sparse(L, verbose=False):
    """
    Direct solver that use scipy sparse matrices

    Solves ``M v = b`` with ``spsolve``, where ``M`` is ``L.data`` plus ones
    in row 0 at columns ``k * (n + 1)`` and ``b`` is the first unit vector.
    The solution is reshaped with ``vec2mat``, symmetrised and returned as a
    ``Qobj`` flagged Hermitian.
    """
    if verbose:
        print('Starting direct solver...')

    n = prod(L.dims[0][0])
    size = n ** 2

    rhs = sp.csr_matrix(([1.0], ([0], [0])), shape=(size, 1), dtype=complex)

    # Ones placed in row 0 at the positions k * (n + 1), k = 0 .. n - 1.
    target_cols = [k * (n + 1) for k in range(n)]
    extra_row = sp.csr_matrix((np.ones(n), (np.zeros(n), target_cols)),
                              shape=(size, size))
    system = L.data + extra_row

    use_solver(assumeSortedIndices=True, useUmfpack=False)
    system.sort_indices()

    if verbose:
        start_time = time.time()
    # Do the actual solving here
    solution = spsolve(system, rhs)
    if verbose:
        print('Direct solver time: ', time.time() - start_time)

    rho = vec2mat(solution)
    rho = 0.5 * (rho + rho.conj().T)
    return Qobj(rho, dims=L.dims[0], isherm=True)
def __init__(self, A, W, **kw): self.columns = kw.get('columns', np.arange(A.shape[1])) self.A = sparse.csr_matrix(A) self.W = sparse.csr_matrix(np.diag(W)) self.O = self.A[:,self.columns].T*self.W*self.A self.O.data = np.log(self.O.data) + 1. self.O.data[np.isnan(self.O.data)] = 0.
def test_paired_distances():
    """paired_distances must agree with the per-metric functions."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    # A second, independent sample so that Y != X.
    Y = rng.random_sample((5, 4))

    for metric, func in iteritems(PAIRED_DISTANCES):
        expected = paired_distances(X, Y, metric=metric)

        # Dense and sparse calls of the underlying function.
        assert_array_almost_equal(expected, func(X, Y))
        assert_array_almost_equal(expected, func(csr_matrix(X), csr_matrix(Y)))

        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # The diagonal of the full pairwise matrix holds the paired
            # distances.
            full = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            assert_array_almost_equal(np.diag(full), expected)

    # A callable metric must match the named one.
    by_name = paired_distances(X, Y, metric='manhattan')
    by_callable = paired_distances(
        X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(by_name, by_callable)

    # X and Y of different lengths must be rejected.
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y)
def test_euclidean_distances():
    """Check pairwise Euclidean distances, with and without given norms."""
    # Tiny hand-checked case, dense then sparse.
    X = [[0]]
    Y = [[1], [2]]
    assert_array_almost_equal(euclidean_distances(X, Y), [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    assert_array_almost_equal(euclidean_distances(X, Y), [[1., 2.]])

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # Supplying correct squared norms must not change the result.
    baseline = euclidean_distances(X, Y)
    with_x = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    with_y = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    with_both = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                                    Y_norm_squared=Y_norm_sq)
    for candidate in (with_x, with_y, with_both):
        assert_array_almost_equal(candidate, baseline)

    # Wrong squared norms must give a visibly wrong answer.
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(wrong_D - baseline)), .01)
def text2spvec(self, query):
    """Create a sparse tfidf-weighted word vector from query.

    tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))

    Returns a ``1 x hash_size`` CSR matrix.  If the query yields no hashed
    n-grams, raises ``RuntimeError`` when ``self.strict`` is set; otherwise
    logs a warning and returns an empty row.
    """
    # Hash every parsed n-gram into the vocabulary space.
    tokens = self.parse(utils.normalize(query))
    hashed = [utils.hash(tok, self.hash_size) for tok in tokens]

    if len(hashed) == 0:
        if self.strict:
            raise RuntimeError('No valid word in: %s' % query)
        logger.warning('No valid word in: %s' % query)
        return sp.csr_matrix((1, self.hash_size))

    # Term frequencies: log(1 + count) per distinct hash.
    unique_ids, counts = np.unique(hashed, return_counts=True)
    term_freqs = np.log1p(counts)

    # Inverse document frequencies, with negative values set to zero.
    doc_counts = self.doc_freqs[unique_ids]
    inv_freqs = np.log((self.num_docs - doc_counts + 0.5) / (doc_counts + 0.5))
    inv_freqs[inv_freqs < 0] = 0

    weights = np.multiply(term_freqs, inv_freqs)

    # A single CSR row: all entries live between indptr[0] and indptr[1].
    row_ptr = np.array([0, len(unique_ids)])
    return sp.csr_matrix((weights, unique_ids, row_ptr),
                         shape=(1, self.hash_size))
def predict(self, X):
    """ Predict values of X from internal dictionary and intercepts

    Parameters
    ----------
    X: csr-matrix (n_samples, n_features)
        Matrix holding the loci of prediction

    Returns
    -------
    X_pred: csr-matrix (n_samples, n_features)
        Matrix with the same sparsity structure as X, with predicted values
    """
    X = sp.csr_matrix(X)
    col_idx, row_ptr = X.indices, X.indptr

    # Predictions are written in place, one value per stored entry of X.
    out = np.zeros_like(X.data)
    _predict(out, col_idx, row_ptr, self.P_, self.Q_)

    if self.detrend:
        # Add back the row mean of each sample, then the column means.
        for row in range(X.shape[0]):
            begin, end = row_ptr[row], row_ptr[row + 1]
            out[begin:end] += self.row_mean_[row]
        out += self.col_mean_.take(col_idx, mode='clip')

    if self.crop is not None:
        # Clamp from above first, then from below.
        out[out > self.crop[1]] = self.crop[1]
        out[out < self.crop[0]] = self.crop[0]

    return sp.csr_matrix((out, col_idx, row_ptr), shape=X.shape)
def test_svd_matrix(W, WT, D, DT):
    """Scan SVD ranks 270..279 and collect the residual for each rank reached.

    Returns an ``OrderedDict`` mapping the number of components actually
    returned by ``sparsesvd`` to ``fnorm(L, DT)``, ordered by residual.  The
    scan stops as soon as a larger ``k`` yields no additional component.
    """
    pinv_W = ss.csr_matrix(np.linalg.pinv(W.todense()))
    pinv_Wt = ss.csr_matrix(np.linalg.pinv(W.transpose().todense()))
    target = ((pinv_W * D) * pinv_Wt).tocsc()

    residuals = {}
    previous_rank = 0
    for k in range(270, 280):
        ut, s, vt = sparsesvd(target, k)
        U = ss.csr_matrix(ut.T)
        S = ss.csr_matrix(np.diag(s))
        V = ss.csr_matrix(vt)
        L = (W * U) * (S * V * WT.transpose())

        rank = U.shape[1]
        if rank == previous_rank:
            break

        residuals[rank] = fnorm(L, DT)
        # Rebuilt on every pass so the name is bound as soon as one rank has
        # been recorded.
        Result = OrderedDict(
            sorted(residuals.items(), key=lambda item: np.float64(item[1])))
        previous_rank = rank
    return Result
def check_smw_solver(p, q, r, s):
    """Helper: check that ``_smw_solver`` solves the intended SMW system.

    A random ``p x q`` matrix ``A`` and a block-diagonal ``B`` (dense
    ``r x r`` block followed by a diagonal block) define the system
    ``(s * I + A B A') y = x``.  The solver output is compared with a direct
    dense solve, for dense and for sparse ``A`` / ``A'A``.

    NOTE: the ``s`` argument is overwritten with 0.5 before use.
    """
    d = q - r
    A = np.random.normal(size=(p, q))
    AtA = np.dot(A.T, A)

    B = np.zeros((q, q))
    B[0:r, 0:r] = np.random.normal(size=(r, r))
    di = np.random.uniform(size=d)
    B[r:q, r:q] = np.diag(1 / di)
    Qi = np.linalg.inv(B[0:r, 0:r])

    s = 0.5
    x = np.random.normal(size=p)

    # Reference answer from a dense direct solve.
    dense_system = s * np.eye(p, p) + np.dot(A, np.dot(B, A.T))
    expected = np.linalg.solve(dense_system, x)

    solve_dense = _smw_solver(s, A, AtA, Qi, di)
    assert_allclose(solve_dense(x), expected)

    solve_sparse = _smw_solver(s, sparse.csr_matrix(A),
                               sparse.csr_matrix(AtA), Qi, di)
    assert_allclose(solve_sparse(x), expected)
def test_unsorted_indices():
    """Sorted and unsorted CSR indices must give the same SVC results.

    A subset of digits is used because iris, blobs and make_classification
    did not show the problem.
    """

    def make_svc():
        return svm.SVC(kernel='linear', probability=True, random_state=0)

    digits = load_digits()
    X, y = digits.data[:50], digits.target[:50]
    X_test = sparse.csr_matrix(digits.data[50:100])
    X_sparse = sparse.csr_matrix(X)

    coef_dense = make_svc().fit(X, y).coef_
    sparse_svc = make_svc().fit(X_sparse, y)
    coef_sorted = sparse_svc.coef_
    # Dense and sparse fits must agree.
    assert_array_almost_equal(coef_dense, coef_sorted.toarray())

    # Re-index every row; the assertions below confirm that the resulting
    # matrices really report unsorted indices.
    X_sparse_unsorted = X_sparse[np.arange(X.shape[0])]
    X_test_unsorted = X_test[np.arange(X_test.shape[0])]
    assert_false(X_sparse_unsorted.has_sorted_indices)
    assert_false(X_test_unsorted.has_sorted_indices)

    unsorted_svc = make_svc().fit(X_sparse_unsorted, y)
    coef_unsorted = unsorted_svc.coef_
    # Unsorted indices must give the same coefficients and probabilities.
    assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray())
    assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted),
                              sparse_svc.predict_proba(X_test))
def check_matrix_and_delay(matrix=None, delay_matrix=None, source_size=None,
                           target_size=None, name='connection matrix'):
    """Validate a weight/delay matrix pair and fill in whatever is missing.

    Parameters
    ----------
    matrix : matrix-like or None
        Weight matrix.  When ``None`` an empty sparse matrix of shape
        ``(source_size, target_size)`` is created.
    delay_matrix : matrix-like or None
        Delay matrix.  When ``None`` an empty sparse matrix is created.
    source_size, target_size : int or None
        Expected shape of ``matrix``; taken from ``matrix`` when ``None``.
    name : str
        Label used in the printed messages.

    Returns
    -------
    tuple
        ``(matrix, delay_matrix, max_delay)``; ``max_delay`` is ``0.0`` when
        the delay data is empty.

    Raises
    ------
    ValueError
        If a delay matrix is given without a weight matrix.
    AssertionError
        If a given size disagrees with the shape of ``matrix``.
    """
    if matrix is not None:
        n_rows, n_cols = matrix.shape[0], matrix.shape[1]
        if source_size is None:
            source_size = n_rows
        else:
            assert source_size == n_rows
        if target_size is None:
            target_size = n_cols
        else:
            assert target_size == n_cols
    else:
        if delay_matrix is not None:
            raise ValueError('You cannot have a delay matrix without having a weight matrix')
        print('No %s found, making empty matrix' % name)
        matrix = spsp.csr_matrix((source_size, target_size))

    if delay_matrix is None:
        print('No delays of %s found, making empty matrix' % name)
        delay_matrix = spsp.csr_matrix((source_size, target_size))

    # Sparse matrices expose their stored values through ``.data``.
    data = delay_matrix.data if spsp.issparse(delay_matrix) else delay_matrix
    max_delay = np.max(data) if len(data) > 0 else 0.0
    return matrix, delay_matrix, max_delay
def HiptmairMatrixSetup(mesh, N, M):
    """Assemble the sparse matrices ``C`` and ``[Px, Py, Pz]`` for a mesh.

    The C++ source stored next to this module is compiled with
    ``compile_extension_module``.  Its ``ProlongationGradsecond`` routine
    fills the coordinate and value buffers in place, and those buffers are
    turned into ``N x M`` CSR matrices.

    Parameters
    ----------
    mesh : mesh object
        Must provide ``num_edges()``; also passed through to the compiled
        routine.
    N, M : int
        Shape of every returned matrix.

    Returns
    -------
    tuple
        ``(C, [Px, Py, Pz])``.
    """
    path = os.path.abspath(
        os.path.join(inspect.getfile(inspect.currentframe()), ".."))
    if __version__ == '1.6.0':
        source_name = 'DiscreteGradientSecond.cpp'
    else:
        source_name = 'DiscreteGradient.cpp'
    # Read through a context manager so the file handle is always closed.
    with open(os.path.join(path, source_name), 'r') as source_file:
        gradient_code = source_file.read()
    compiled_gradient_module = compile_extension_module(code=gradient_code)

    # Two entries per edge in every buffer.
    n_entries = 2 * mesh.num_edges()
    column = numpy.zeros(n_entries, order="C")
    row = numpy.zeros(n_entries, order="C")
    data = numpy.zeros(n_entries, order="C")
    dataX = numpy.zeros(n_entries, order="C")
    dataY = numpy.zeros(n_entries, order="C")
    dataZ = numpy.zeros(n_entries, order="C")

    tic()
    # Called for its side effect: the buffers above are filled in place.
    compiled_gradient_module.ProlongationGradsecond(
        mesh, dataX, dataY, dataZ, data, row, column)
    end = toc()
    MO.StrTimePrint("Data for C and P created, time: ", end)

    C = csr_matrix((data, (row, column)), shape=(N, M)).tocsr()
    Px = csr_matrix((dataX, (row, column)), shape=(N, M)).tocsr()
    Py = csr_matrix((dataY, (row, column)), shape=(N, M)).tocsr()
    Pz = csr_matrix((dataZ, (row, column)), shape=(N, M)).tocsr()
    return C, [Px, Py, Pz]
def __init__(self, dimension, alpha, lambda_, n, alpha_2, cluster_init="Complete"):
    """Create ``n`` user structures and the initial user graph.

    With ``cluster_init == "Erdos-Renyi"`` the graph is a random 0/1 matrix
    whose entries are 1 with probability ``3 * log(n) / n``; with any other
    value it is the all-ones ``n x n`` matrix.
    """
    self.time = 0
    # The algorithm has n users, each with its own user structure.
    self.users = []
    for user_id in range(n):
        self.users.append(CLUBUserStruct(dimension, lambda_, user_id))

    self.dimension = dimension
    self.alpha = alpha
    self.alpha_2 = alpha_2

    if cluster_init == "Erdos-Renyi":
        p = 3 * math.log(n) / n
        self.Graph = np.random.choice([0, 1], size=(n, n), p=[1 - p, p])
    else:
        self.Graph = np.ones([n, n])

    # Both initialisations share the same follow-up steps.
    self.clusters = []
    g = csr_matrix(self.Graph)
    N_components, components = connected_components(g)

    self.CanEstimateCoUserPreference = False
    self.CanEstimateUserPreference = False
    self.CanEstimateW = False
def get_sij(rij):
    """Return ``dij . rij . dji`` as a dense matrix.

    ``dij`` and ``dji`` are ``get_dij`` applied to ``rij`` and to its
    transpose; the product itself is carried out on CSR matrices.
    """
    transposed = rij.transpose()
    left = csr_matrix(get_dij(rij))
    right = csr_matrix(get_dij(transposed))
    middle = csr_matrix(rij)
    return left.dot(middle).dot(right).todense()
def gen_app_pop_count(dev_app, ga_train, ga_test, base_dir='/data'):
    """Write the popularity-weighted app count per device for train and test.

    The popularity of an app is the number of distinct devices it appears
    on.  A device's score is the sum of the popularities of its distinct
    apps.  Scores are looked up for every row of ``ga_train`` / ``ga_test``
    (devices without any app get 0), divided by the maximum of the
    respective set and written as single-column sparse matrices in Matrix
    Market format.

    Parameters
    ----------
    dev_app : pandas.DataFrame
        Must contain the columns ``device_id`` and ``app_id``.
    ga_train, ga_test : pandas.DataFrame
        Must contain the column ``device_id``.
    base_dir : str
        Directory that receives ``train_apppopcount.mtx`` and
        ``test_apppopcount.mtx``.

    Returns
    -------
    None
    """
    # Function-scope import: the module is not known to import ``os``.
    import os

    start_time = time.time()
    print('generating popularity weighted app count per device')

    # Plain Series instead of the dict-renaming ``agg({...})`` form, which
    # pandas >= 1.0 rejects with SpecificationError.
    app_popularity = dev_app.groupby(['app_id'])['device_id'].nunique()
    app_pop_count = dev_app.groupby(['device_id'])['app_id'].agg(
        lambda apps: app_popularity.loc[apps.unique()].sum())

    def scaled_column(devices):
        # Scores of the given devices as an (n, 1) sparse column scaled by
        # the set's maximum.
        scores = devices.map(app_pop_count).fillna(0)
        scores = scores / scores.max()
        return csr_matrix(scores.values).transpose()

    app_count_train = scaled_column(ga_train['device_id'])
    app_count_test = scaled_column(ga_test['device_id'])

    # os.path.join supplies the separator that plain concatenation dropped
    # ('/data' + 'train_...' used to produce '/datatrain_...').
    print('train set shape: ', app_count_train.shape)
    io.mmwrite(os.path.join(base_dir, "train_apppopcount.mtx"), app_count_train)
    print('test set shape: ', app_count_test.shape)
    io.mmwrite(os.path.join(base_dir, "test_apppopcount.mtx"), app_count_test)
    print('Time generating app pop count: ', (time.time() - start_time) / 60)
def test_multiclass_to_ranking():
    """Check shape and structure of the ``multiclass_to_ranking`` output."""
    X = sp.csr_matrix(np.arange(6).reshape((3, 2)))

    # Label indicator matrix: 3 samples, 5 classes.
    y = sp.csr_matrix((3, 5))
    y[0, 0] = 1
    y[1, [2, 3]] = 1
    y[2, [0, 4]] = 1

    n_classes = y.shape[1]
    n_samples = X.shape[0]
    n_features = X.shape[1]

    X_ext, compars = multiclass_to_ranking(X, y)
    assert X_ext.shape[0] == n_classes * n_samples
    assert X_ext.shape[1] == n_classes + n_features

    # test that features are replicated
    assert_array_equal(X_ext.tocsc()[:, n_classes:].sum(axis=0),
                       X.sum(axis=0) * n_classes)

    # test class labels encoding structure
    assert X_ext.tocsr()[:n_classes, :].sum(axis=0)[0, 0] == n_samples
    assert X_ext.tocsr()[:, :n_classes].sum() == n_samples * n_classes

    # Debug output.  Single-argument print(...) calls behave the same on
    # Python 2 and 3; the old print statements were a SyntaxError on 3.
    print(X_ext.todense())
    print(y.todense())
    print(compars)
def setUp(self):
    """Store two small undirected graphs as CSR adjacency matrices."""
    adjacency = [
        # G0, 5 nodes.  Edges: 0-1, 0-3, 1-2, 1-3, 1-4, 2-4, 3-4
        ('G0', [[0, 1, 0, 1, 0],
                [1, 0, 1, 1, 1],
                [0, 1, 0, 0, 1],
                [1, 1, 0, 0, 1],
                [0, 1, 1, 1, 0]]),
        # G1, 6 nodes.  Edges: 0-1, 0-2, 1-2, 1-3, 3-4, 3-5, 4-5
        ('G1', [[0, 1, 1, 0, 0, 0],
                [1, 0, 1, 1, 0, 0],
                [1, 1, 0, 0, 0, 0],
                [0, 1, 0, 0, 1, 1],
                [0, 0, 0, 1, 0, 1],
                [0, 0, 0, 1, 1, 0]]),
    ]
    for attr, rows in adjacency:
        setattr(self, attr, csr_matrix(array(rows)))
        graph = getattr(self, attr)
        # make sure graph is symmetric
        assert_equal((graph - graph.T).nnz, 0)
def test_feature_inference_fails():
    """Predicting with ids beyond the fitted feature count must be rejected.

    If feature inference is used at predict time with ids higher than the
    number of features supplied to fit, the model should complain.
    """
    no_users, no_items = (10, 100)
    no_features = 20

    interactions = sp.coo_matrix((no_users, no_items), dtype=np.int32)
    user_features = sp.csr_matrix((no_users, no_features), dtype=np.int32)
    item_features = sp.csr_matrix((no_items, no_features), dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions,
                      user_features=user_features,
                      item_features=item_features)

    # ``no_features`` itself is one past the largest valid feature index.
    out_of_range_users = np.array([no_features], dtype=np.int32)
    out_of_range_items = np.array([no_features], dtype=np.int32)
    with pytest.raises(AssertionError):
        model.predict(out_of_range_users, out_of_range_items)
def test_sparse_quad_obj(self):
    """Time the dense and the sparse objective on growing problem sizes.

    Nothing is asserted; the timings collected for each size are printed.
    """
    times_dense = []
    times_sparse = []
    for n in [20, 200, 2000]:
        # Floor division keeps m1 an integer on Python 3, as ``n/2`` did on
        # Python 2.
        m1 = n // 2
        # Separate name for the 0.9 setting, which used to share ``A_sparse``
        # with the CSR matrix built below.
        sparsity = 0.9
        data = generate_data(n=n, m1=m1, A_sparse=sparsity)
        A, b, x_true = data['A'], data['b'], data['x_true']
        A_sparse = sps.csr_matrix(A)
        A_sparse_T = sps.csr_matrix(A.T)
        Q, c = construct_qp_from_least_squares(A, b)

        def obj_np(x, g):
            return quad_obj_np(x, Q, c, g)

        def obj_sparse(x, g):
            return sparse_least_squares_obj(x, A_sparse_T, A_sparse, b, g)

        g = np.zeros(n)

        start_time = time.time()
        obj_np(x_true, g)
        times_dense.append(time.time() - start_time)

        start_time = time.time()
        obj_sparse(x_true, g)
        times_sparse.append(time.time() - start_time)

    # %-formatting into a single argument prints the same text on Python 2
    # and 3; the old print statements were a SyntaxError on Python 3.
    print('times for sparse QP %s' % (times_sparse,))
    print('times for dense QP %s' % (times_dense,))
def tidyup(self,atol=qset.auto_tidyup_atol):
    """Removes small elements from a quantum object.

    Parameters
    ----------
    atol : float
        Absolute tolerance used by tidyup.  Default is set
        via qutip global settings parameters.

    Returns
    -------
    oper: qobj
        Quantum object with small elements removed.

    """
    stored_abs = abs(self.data.data.flatten())
    # Largest stored magnitude, or 0.0 when every stored value is zero.
    largest = max(stored_abs) if any(stored_abs) else 0.0

    if largest >= 1e-15:
        magnitudes = abs(self.data.data)
        cleaned = self.data.copy()
        # Zero everything below the tolerance scaled by the largest entry.
        threshold = atol * largest + np.finfo(float).eps
        cleaned.data[magnitudes < threshold] = 0
    else:
        # Nothing significant is stored: start from an empty matrix.
        cleaned = sp.csr_matrix((self.shape[0], self.shape[1]), dtype=complex)

    cleaned.eliminate_zeros()
    return Qobj(cleaned, dims=self.dims, shape=self.shape,
                type=self.type, isherm=self.isherm)
def links2vec(links,out_path,tmp_path,dim=100,cds=1.0,eig=0.5,verbose='none'):
    """Turn word/link counts into word vectors via PMI and sparse SVD.

    Pipeline, with every stage written to disk:

    1. Build a word x link count matrix from ``links`` and save both
       vocabularies under ``tmp_path + 'pmi'``.
    2. Compute the PMI matrix with ``calc_pmi(counts, cds)`` and save it with
       ``np.savez_compressed``.
    3. Reload it through ``PositiveExplicit``, run ``sparsesvd`` and save
       ``ut``, ``s``, ``vt`` plus the vocabularies under the ``svd`` prefix.
    4. Write ``vectors.txt`` and ``vectors_readme.txt`` under ``out_path``.

    Parameters
    ----------
    links : pandas.DataFrame
        Must provide the columns ``word``, ``link`` and ``count``.
    out_path, tmp_path : str
        Path prefixes.  File names are appended by plain concatenation, so a
        directory must end with a path separator.
    dim : int
        Number of components requested from ``sparsesvd``.
    cds : float
        Forwarded to ``calc_pmi``.
    eig : float
        Forwarded to ``SVDEmbedding``.
    verbose : str
        Not used by the live code; only referenced in commented-out prints.

    Returns
    -------
    dict
        ``{'vectors_file': <path of vectors.txt>}``.
    """
    logger = logging.getLogger(__name__ + ".links2vec")
    # 80204: Language Learning - Clustering pipeline January 2018.ipynb
    '''links => PMI'''
    # cds used to be read from the command line: context distribution
    # smoothing [default: 1.0]
    pmi_path = tmp_path + 'pmi'
    start = time.time()
    # A former filter (links['count'] > 2) is disabled; all links are used.
    linkz = links
    words = linkz.groupby('word').sum().reset_index() \
        .sort_values(by=['count','word'], ascending=[False,True])
    contexts = linkz.groupby('link').sum().reset_index() \
        .sort_values(by=['count','link'], ascending=[False,True])
    # Each frame is logged with at most 6 rows displayed.
    logger.info(f'Linkz: {len(linkz)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f"{linkz}")
    logger.info(f'words: {len(words)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f'{words}\n')
    logger.info(f'contexts: {len(contexts)} items')
    with pd.option_context('display.max_rows', 6): logger.info(f"{contexts}")
    # Sorted vocabularies (index -> item) and their inverse maps.
    iw = sorted(words['word'].drop_duplicates().values.tolist())
    ic = sorted(contexts['link'].drop_duplicates().values.tolist())
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])
    # Counts are collected in a DOK buffer and folded into the CSR matrix
    # every ``update_threshold`` rows.
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000  # ~ batch size
    i = 0
    for row in linkz.itertuples():
        if row.word in wi and row.link in ci:
            # NOTE(review): assignment, not accumulation -- a (word, link)
            # pair repeated within one batch keeps only its last count.
            tmp_counts[wi[row.word], ci[row.link]] = int(row.count)
        i += 1
        if i == update_threshold:
            counts = counts + tmp_counts.tocsr()
            tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
            i = 0
    # Fold in the last, partially filled buffer.
    counts = counts + tmp_counts.tocsr()
    list2tsv(iw, pmi_path + '.words.vocab')  # any need to save?
    list2tsv(ic, pmi_path + '.contexts.vocab')
    logger.info(f'PMI data saved to {pmi_path}')
    pmi = calc_pmi(counts, cds)
    # savez_compressed appends '.npz' to the file name if it is missing.
    np.savez_compressed(pmi_path, \
        data=pmi.data, indices=pmi.indices, indptr=pmi.indptr, shape=pmi.shape)
    logger.info(f'PMI matrix {type(pmi)}, {pmi.shape}\nsaved to {pmi_path}')
    '''PMI => SVD'''
    # Replace the trailing 'pmi' of the prefix with 'svd'.
    svd_path = pmi_path[:-3] + 'svd'
    neg = 1  # number of negative samples [default: 1]; subtracts its log from PMI
    logger.info(f'SVD started: dim {dim}, output: {svd_path}...')
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    np.save(svd_path + '.ut.npy', ut)
    np.save(svd_path + '.s.npy', s)
    np.save(svd_path + '.vt.npy', vt)
    list2tsv(explicit.iw, svd_path + '.words.vocab')  # any need to save?
    list2tsv(explicit.ic, svd_path + '.contexts.vocab')
    logger.info(f'SVD matrix (3 files .npy) saved: {len(ut[0])} vectors, ut: {len(ut)}, s: {len(s)}, vt: {len(vt)}')
    '''SVD => vectors.txt'''
    out_file = out_path + 'vectors.txt'
    svd = SVDEmbedding(svd_path, True, eig)
    # One line per word: the word followed by its space-separated components.
    with open(out_file, 'w') as file:
        for i, w in enumerate(svd.iw):
            file.write(w+' '+(' '.join([str(x) for x in svd.m[i]]))+'\n')
    readme_path = out_path + 'vectors_readme.txt'
    readme = 'Word vectors: dimension '+str(dim)+', '+str(len(svd.iw))+' vectors'
    with open(readme_path, 'w') as f: f.write(readme)
    # Logged at WARNING level: elapsed time and time per vector.
    logger.warning(f'vectors saved to\n {out_file} - elapsed {int(round(time.time() - start, 0))} s ~ '
                   f'{round((time.time() - start)/len(ut[0])*1000, 3)} ms/vector')
    response = {'vectors_file': out_file}
    return response
def load_sparse_csr(filename):
    """Load a CSR matrix stored as an ``.npz`` archive, cast to float32.

    Parameters
    ----------
    filename : str or file-like
        Archive containing the arrays ``data``, ``indices``, ``indptr`` and
        ``shape``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with dtype ``np.float32``.
    """
    # The archive keeps its file open until closed; arrays are read lazily,
    # so the matrix has to be built inside the ``with`` block.
    with np.load(filename) as loader:
        shape = tuple(int(dim) for dim in loader["shape"])
        return sparse.csr_matrix(
            (loader["data"], loader["indices"], loader["indptr"]),
            shape=shape, dtype=np.float32)
def convert_X(mat):
    """Return ``mat`` converted to a CSR matrix."""
    return csr_matrix(mat)
def represent(self, w):
    """Return the row of ``self.m`` for word ``w``.

    A word missing from ``self.wi`` yields an empty ``1 x len(self.ic)`` CSR
    row instead.
    """
    if w not in self.wi:
        return csr_matrix((1, len(self.ic)))
    row_index = self.wi[w]
    return self.m[row_index, :]
def load_matrix(f):
    """Load a CSR matrix from an ``.npz`` archive.

    Parameters
    ----------
    f : str
        Path of the archive; ``'.npz'`` is appended when missing.  The
        archive must contain ``data``, ``indices``, ``indptr`` and ``shape``.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    if not f.endswith('.npz'):
        f += '.npz'
    # The archive keeps its file open until closed; arrays are read lazily,
    # so the matrix has to be built inside the ``with`` block.
    with np.load(f) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape)
def pmisvd(links,path,tmpath, dim=100, cds=1.0, eig=0.5, neg=1, verbose='none'):
    """Compute word vectors via PMI and sparse SVD, returning them in memory.

    Intermediate files are written under ``tmpath`` (prefixes ``pmi`` and
    ``svd``); ``vectors.txt`` and ``vectors_readme.txt`` are written under
    ``path``.  One trailing ``'/'`` is stripped from either directory.

    Parameters
    ----------
    links : pandas.DataFrame
        Must provide the columns ``word``, ``link`` and ``count``.
    path : str
        Directory receiving ``vectors.txt`` and the readme (non-empty).
    tmpath : str
        Directory receiving the temporary files (non-empty).
    dim : int
        Number of components requested from ``sparsesvd``; lowered to the
        actual vector length if ``SVDEmbedding`` returns shorter vectors.
    cds : float
        Forwarded to ``calc_pmi``.
    eig : float
        Forwarded to ``SVDEmbedding``.
    neg : int
        Forwarded to ``PositiveExplicit``.
    verbose : str
        Not used.

    Returns
    -------
    tuple
        ``(vectors_df, singular_values, {'vectors_file': out_file})``:
        ``vectors_df`` has a ``word`` column followed by columns
        ``1 .. dim``; ``singular_values`` is ``s`` as a list.
    """
    logger = logging.getLogger(__name__ + ".pmisvd")
    '''80223 epmisvd enhanced: return +singular values'''
    # path - dir to save vectors.txt and readme
    # tmpath - dir to save temporary files
    # cds = 1.0 # context distribution smoothing [default: 1.0]
    # eig = 0.5 # weighted exponent of the eigenvalue matrix [default: 0.5]
    # neg = 1 # Number of negative samples; [default: 1] subtracts its log from PMI
    #         # PMI => SVD PositiveExplicit parameter
    # NOTE(review): indexing [-1] raises IndexError on an empty string.
    if tmpath[-1] == '/': tmpath = tmpath[:-1]
    if path[-1] == '/': path = path[:-1]
    '''links => PMI'''
    pmi_path = tmpath + '/pmi'
    start = time.time()
    # A former filter (links['count'] > 2) is disabled; all links are used.
    linkz = links
    words = linkz.groupby('word').sum().reset_index()\
        .sort_values(by=['count','word'], ascending=[False,True])
    contexts = linkz.groupby('link').sum().reset_index()\
        .sort_values(by=['count','link'], ascending=[False,True])
    # Sorted vocabularies (index -> item) and their inverse maps.
    iw = sorted(words['word'].drop_duplicates().values.tolist())
    ic = sorted(contexts['link'].drop_duplicates().values.tolist())
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])
    # Counts are collected in a DOK buffer and folded into the CSR matrix
    # every ``update_threshold`` rows.
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000  # ~ batch size
    i = 0
    for row in linkz.itertuples():
        if row.word in wi and row.link in ci:
            # NOTE(review): assignment, not accumulation -- a (word, link)
            # pair repeated within one batch keeps only its last count.
            tmp_counts[wi[row.word], ci[row.link]] = int(row.count)
        i += 1
        if i == update_threshold:
            counts = counts + tmp_counts.tocsr()
            tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
            i = 0
    # Fold in the last, partially filled buffer.
    counts = counts + tmp_counts.tocsr()
    list2tsv(iw, pmi_path + '.words.vocab')  # any need to save?
    list2tsv(ic, pmi_path + '.contexts.vocab')
    '''counts + vocab => pmi'''
    pmi = calc_pmi(counts, cds)
    # savez_compressed appends '.npz' to the file name if it is missing.
    np.savez_compressed(pmi_path, \
        data=pmi.data, indices=pmi.indices, indptr=pmi.indptr, shape=pmi.shape)
    '''PMI => SVD'''
    # Replace the trailing 'pmi' of the prefix with 'svd'.
    svd_path = pmi_path[:-3] + 'svd'
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    np.save(svd_path + '.ut.npy', ut)
    np.save(svd_path + '.s.npy', s)
    np.save(svd_path + '.vt.npy', vt)
    list2tsv(explicit.iw, svd_path + '.words.vocab')  # any need to save?
    list2tsv(explicit.ic, svd_path + '.contexts.vocab')
    '''SVD => vectors.txt'''
    svd = SVDEmbedding(svd_path, True, eig)  # TODO: move code here, RAM2RAM
    # Shrink dim when the vectors are shorter than requested.
    if len(svd.m[0]) < dim: dim = len(svd.m[0])  # 80216
    vectors_df = pd.DataFrame(columns=['word'] + list(range(1,dim+1)))
    # One row per word: the word followed by its vector components.
    for i, w in enumerate(svd.iw):
        vectors_df.loc[i] = [w] + svd.m[i].tolist()
    out_file = path + '/vectors.txt'
    # One line per word: the word followed by its space-separated components.
    with open(out_file, 'w') as file:
        for i, w in enumerate(svd.iw):
            file.write(w+' '+(' '.join([str(x) for x in svd.m[i]]))+'\n')
    readme_path = path + '/vectors_readme.txt'
    readme = 'Word vectors: dimension '+str(dim)+', '+str(len(svd.iw))+' vectors'
    with open(readme_path, 'w') as f: f.write(readme)
    singular_values = s.tolist()  # type(s): numpy.ndarray
    return vectors_df, singular_values, {'vectors_file': out_file}
def epmisvd(links, path, tmpath, dim=100, cds=1.0, eig=0.5, neg=1, verbose='none'):
    """Build word vectors from link counts: links => PMI => SVD => vectors.txt.

    Progress is reported through the ``<module>.epmisvd`` logger.

    Args:
        links: DataFrame with ``word``, ``link`` and ``count`` columns.
        path: Prefix for the output files. ``'vectors.txt'`` and
            ``'vectors_readme.txt'`` are appended to it verbatim, so a
            directory must be given with its trailing separator.
        tmpath: Prefix for temporary files (``'pmi'`` / ``'svd'`` are
            appended verbatim), with the same convention as ``path``.
        dim: Requested number of SVD dimensions.
        cds: Context distribution smoothing, passed to ``calc_pmi``.
        eig: Weighting exponent of the eigenvalue matrix, passed to
            ``SVDEmbedding``.
        neg: Number of negative samples, passed to ``PositiveExplicit``.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(vectors_df, response)`` where ``vectors_df`` has a ``word``
        column followed by columns ``1..dim`` and ``response`` is
        ``{'vectors_file': <path of vectors.txt>}``.
    """
    logger = logging.getLogger(__name__ + ".epmisvd")

    # --- links => co-occurrence counts ---
    pmi_path = tmpath + 'pmi'
    start = time.time()
    linkz = links
    words = linkz.groupby('word').sum().reset_index() \
        .sort_values(by=['count', 'word'], ascending=[False, True])
    contexts = linkz.groupby('link').sum().reset_index() \
        .sort_values(by=['count', 'link'], ascending=[False, True])

    # The frames are rendered inside option_context so that the logged text
    # is truncated to a few rows.
    logger.info(f'Linkz: {len(linkz)} items')
    with pd.option_context('display.max_rows', 6):
        logger.info(f'{linkz}')
    logger.info(f'words: {len(words)} items')
    with pd.option_context('display.max_rows', 6):
        logger.info(f'{words}\n')
    logger.info(f'contexts: {len(contexts)} items')
    with pd.option_context('display.max_rows', 6):
        logger.info(f'{contexts}')

    iw = sorted(words['word'].drop_duplicates().values.tolist())
    ic = sorted(contexts['link'].drop_duplicates().values.tolist())
    wi = {w: i for i, w in enumerate(iw)}
    ci = {c: i for i, c in enumerate(ic)}

    shape = (len(wi), len(ci))
    counts = csr_matrix(shape, dtype=np.float32)
    tmp_counts = dok_matrix(shape, dtype=np.float32)
    update_threshold = 100000  # ~ batch size
    pending = 0
    for row in linkz.itertuples():
        if row.word in wi and row.link in ci:
            # Accumulate, so a repeated (word, link) pair is summed whether
            # or not the repeats fall into the same batch.
            tmp_counts[wi[row.word], ci[row.link]] += int(row.count)
        pending += 1
        if pending == update_threshold:
            counts = counts + tmp_counts.tocsr()
            tmp_counts = dok_matrix(shape, dtype=np.float32)
            pending = 0
    counts = counts + tmp_counts.tocsr()
    list2tsv(iw, pmi_path + '.words.vocab')  # any need to save?
    list2tsv(ic, pmi_path + '.contexts.vocab')
    logger.info('PMI data saved to ' + pmi_path)

    # --- counts + vocab => PMI ---
    pmi = calc_pmi(counts, cds)
    np.savez_compressed(pmi_path, data=pmi.data, indices=pmi.indices,
                        indptr=pmi.indptr, shape=pmi.shape)
    logger.info(f'PMI matrix {type(pmi)} {pmi.shape}\nsaved to {pmi_path}')

    # --- PMI => SVD ---
    svd_path = tmpath + 'svd'
    logger.info(f'SVD started: dim {dim}, output: {svd_path}...')
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    np.save(svd_path + '.ut.npy', ut)
    np.save(svd_path + '.s.npy', s)
    np.save(svd_path + '.vt.npy', vt)
    list2tsv(explicit.iw, svd_path + '.words.vocab')  # any need to save?
    list2tsv(explicit.ic, svd_path + '.contexts.vocab')
    logger.info(f'SVD matrix (3 files .npy) saved: {len(ut[0])} vectors, ut: {len(ut)} s: {len(s)} vt:{len(vt)}')

    # --- SVD => vectors.txt ---
    svd = SVDEmbedding(svd_path, True, eig)  # TODO: move code here, RAM2RAM
    # The embedding may have fewer columns than requested.
    if len(svd.m[0]) < dim:
        dim = len(svd.m[0])
    vectors_df = pd.DataFrame(columns=['word'] + list(range(1, dim + 1)))
    for i, w in enumerate(svd.iw):
        vectors_df.loc[i] = [w] + svd.m[i].tolist()

    out_file = path + 'vectors.txt'
    with open(out_file, 'w') as file:
        for i, w in enumerate(svd.iw):
            file.write(w + ' ' + ' '.join([str(x) for x in svd.m[i]]) + '\n')

    readme_path = path + 'vectors_readme.txt'
    readme = 'Word vectors: dimension ' + str(dim) + ', ' + str(len(svd.iw)) + ' vectors'
    with open(readme_path, 'w') as f:
        f.write(readme)

    elapsed = time.time() - start
    n_vectors = len(ut[0])
    # Guard the per-vector figure so an empty result cannot divide by zero.
    ms_per_vector = round(elapsed / n_vectors * 1000, 3) if n_vectors else 0.0
    logger.info(f'vectors saved to\n {out_file} - elapsed {int(round(elapsed, 0))} s ~ '
                f'{ms_per_vector} ms/vector')

    response = {'vectors_file': out_file}
    return vectors_df, response
import numpy as np import scipy.sparse as sps from External_Libraries.Notebooks_utils.data_splitter import train_test_holdout from External_Libraries.Similarity.Compute_Similarity_Python import Compute_Similarity_Python from External_Libraries.Notebooks_utils.evaluation_function import evaluate_algorithm import matplotlib.pyplot as pyplot from External_Libraries.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender from External_Libraries.KNN.UserKNNCFRecommender import UserKNNCFRecommender from External_Libraries.ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt from External_Libraries.ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs from External_Libraries.DataIO import DataIO from External_Libraries.Base.Recommender_utils import check_matrix from External_Libraries.Base.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender from External_Libraries.SLIM_ElasticNet.SLIMElasticNetRecommender import SLIMElasticNetRecommender URM_all = sps.csr_matrix(sps.load_npz("../../Dataset/old/data_all.npz")) URM_train = sps.csr_matrix(sps.load_npz("../../Dataset/old/data_train.npz")) URM_test = sps.csr_matrix(sps.load_npz("../../Dataset/old/data_test.npz")) class ItemKNNScoresHybridRecommender(BaseItemSimilarityMatrixRecommender): RECOMMENDER_NAME = "ItemKNNScoresHybridRecommender" def __init__(self, URM_train, Recommender_1, Recommender_2): super(ItemKNNScoresHybridRecommender, self).__init__(URM_train) self.URM_train = check_matrix(URM_train.copy(), 'csr') self.Recommender_1 = Recommender_1 self.Recommender_2 = Recommender_2 def fit(self, alpha):
def testSolveTriangular(self):
    """Check solve_triangular on dense and sparse inputs for several chunkings."""
    from mars.tensor import tril, triu

    run = self.executor.execute_tensor

    def check_dense(mat, rhs, chunk):
        # Solve against the upper and then the lower triangle, and verify
        # that the matching triangle times the solution reproduces rhs.
        A = tensor(mat, chunk_size=chunk)
        b = tensor(rhs, chunk_size=chunk)
        for solve_kwargs, triangle in (({}, triu), ({'lower': True}, tril)):
            x = solve_triangular(A, b, **solve_kwargs)
            t = triangle(A).dot(x)
            res = run(t, concat=True)[0]
            np.testing.assert_allclose(res, rhs)

    np.random.seed(1)

    # (matrix shape, right-hand-side shape, chunk sizes to try)
    dense_cases = (
        ((20, 20), (20,), (20, 10)),
        ((10, 10), (10, 5), (10, 3)),
    )
    for mat_shape, rhs_shape, chunk_sizes in dense_cases:
        mat = np.random.randint(1, 10, mat_shape)
        rhs = np.random.randint(1, 10, rhs_shape)
        for chunk in chunk_sizes:
            check_dense(mat, rhs, chunk)

    # test sparse
    for rhs_shape in ((10,), (10, 2)):
        mat = sps.csr_matrix(np.triu(np.random.randint(1, 10, (10, 10))))
        rhs = np.random.random(rhs_shape)
        A = tensor(mat, chunk_size=5)
        b = tensor(rhs, chunk_size=5)

        x = solve_triangular(A, b)
        result_x = run(x, concat=True)[0]
        result_b = mat.dot(result_x)

        self.assertIsInstance(result_x, SparseNDArray)
        np.testing.assert_allclose(result_b, rhs)
def test_check_array():
    """Exercise check_array over sparse, dimensionality, dtype/order/copy and format options."""
    # accept_sparse == False
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    with pytest.raises(TypeError):
        check_array(X_csr)
    # ensure_2d=False
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert X_array.ndim == 1
    # ensure_2d=True with 1d array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got 1D array instead"):
        check_array([0, 1, 2], ensure_2d=True)

    # ensure_2d=True with scalar array
    with pytest.raises(ValueError, match="Expected 2D array,"
                                         " got scalar array instead"):
        check_array(10, ensure_2d=True)

    # a 3-D input is rejected unless allow_nd=True
    X_ndim = np.arange(8).reshape(2, 2, 2)
    with pytest.raises(ValueError):
        check_array(X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(int)
    X_float = X_C.astype(float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, int, float, np.float32, None, bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    # every combination of input array, target dtype, memory order and copy flag
    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            # dtype=None keeps the input dtype
            assert X_checked.dtype == X.dtype
        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and
                    X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']):
                assert X is X_checked

    # allowed sparse != None
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(int)
    X_float = X_csc.astype(float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        # record warnings so they can be inspected below
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X, dtype=dtype,
                                    accept_sparse=accept_sparse, copy=copy)
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            # XXX unreached code as of v0.22
            message = str(w[0].message)
            messages = ["object dtype is not supported by sparse matrices",
                        "Can't check dok sparse matrix for nan or inf."]
            assert message in messages
        else:
            assert len(w) == 0
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if X.format in accept_sparse:
            # no change if allowed
            assert X.format == X_checked.format
        else:
            # got converted to the first accepted format
            assert X_checked.format == accept_sparse[0]
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if X.dtype == X_checked.dtype and X.format == X_checked.format:
                assert X is X_checked

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, np.ndarray)
    # raise on too deep lists
    with pytest.raises(ValueError):
        check_array(X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise
    # convert weird stuff to arrays
    X_no_array = _NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert isinstance(result, np.ndarray)
def test_variance_threshold():
    """VarianceThreshold with a custom threshold keeps a single column."""
    selector_threshold = .4
    for candidate in (data, csr_matrix(data)):
        selector = VarianceThreshold(threshold=selector_threshold)
        reduced = selector.fit_transform(candidate)
        # Same expectation for the dense and the sparse input.
        assert_equal((len(data), 1), reduced.shape)
def testSolve(self):
    """Check solve() on dense and sparse matrices for several chunkings."""
    import scipy.linalg

    run = self.executor.execute_tensor

    def check_dense(mat, rhs, a_chunk, b_chunk):
        # The solution must match scipy's, and A.dot(x) must give back rhs.
        A = tensor(mat, chunk_size=a_chunk)
        b = tensor(rhs, chunk_size=b_chunk)
        x = solve(A, b)
        res = run(x, concat=True)[0]
        np.testing.assert_allclose(res, scipy.linalg.solve(mat, rhs))
        res = run(A.dot(x), concat=True)[0]
        np.testing.assert_allclose(res, rhs)

    def sparse_product(mat, rhs, chunk):
        # Execute A.dot(solve(A, b)) for a sparse A and return the result.
        A = tensor(mat, chunk_size=chunk)
        b = tensor(rhs, chunk_size=chunk)
        x = solve(A, b)
        return run(A.dot(x), concat=True)[0]

    np.random.seed(1)

    data1 = np.random.randint(1, 10, (20, 20))
    for rhs_shape in ((20,), (20, 5), (20, 20)):
        data2 = np.random.randint(1, 10, rhs_shape)
        check_dense(data1, data2, 5, 5)

    # test for not all chunks are square in matrix A
    data2 = np.random.randint(1, 10, (20,))
    check_dense(data1, data2, 6, 6)
    check_dense(data1, data2, (7, 6), 6)

    # test sparse
    data1 = sps.csr_matrix(np.random.randint(1, 10, (20, 20)))
    data2 = np.random.randint(1, 10, (20,))

    A = tensor(data1, chunk_size=5)
    b = tensor(data2, chunk_size=5)
    x = solve(A, b)
    res = run(x, concat=True)[0]
    self.assertIsInstance(res, SparseNDArray)
    np.testing.assert_allclose(data1.dot(res), data2)

    for rhs_shape in ((20, 5), (20, 20)):
        data2 = np.random.randint(1, 10, rhs_shape)
        res = sparse_product(data1, data2, 5)
        self.assertIsInstance(res, SparseNDArray)
        np.testing.assert_allclose(res, data2)

    # test for not all chunks are square in matrix A
    data2 = np.random.randint(1, 10, (20,))
    res = sparse_product(data1, data2, 6)
    np.testing.assert_allclose(res, data2)
def _read_facts(fact_file, relation_embeddings, question_embedding, seeds, qId):
    """Read triples from the fact file and build sparse adjacency matrices.

    Returns a mapping of entities to their indices, a mapping of relations
    to their row-normalised (optionally weighted) adjacency matrices, and
    the combined row-normalised adjacency matrix. When no fact survives
    filtering the result is ``({}, {}, None)``.
    """
    seeds_found = set()
    entity_map = {}
    relation_map = {}
    all_row_ones, all_col_ones = [], []
    num_facts = 0
    with open(fact_file) as f:
        for line in f:
            fields = line.strip().split(None, 2)
            if len(fields) != 3:
                # Not a complete (e1, rel, e2) triple.
                continue
            e1, rel, e2 = fields
            if _filter_relation(rel):
                continue
            # New entities receive the next free index.
            idx1 = entity_map.setdefault(e1, len(entity_map))
            idx2 = entity_map.setdefault(e2, len(entity_map))
            rows_cols = relation_map.setdefault(rel, [[], []])
            for entity in (e1, e2):
                if entity in seeds:
                    seeds_found.add(entity)
            # The combined graph stores each fact in both directions.
            all_row_ones.extend((idx1, idx2))
            all_col_ones.extend((idx2, idx1))
            rows_cols[0].append(idx1)
            rows_cols[1].append(idx2)
            num_facts += 1
            if num_facts == MAX_FACTS:
                break
    if not relation_map:
        return {}, {}, None

    num_entities = len(entity_map)
    square = (num_entities, num_entities)
    for rel, (row_ones, col_ones) in list(relation_map.items()):
        m = csr_matrix(
            (np.ones((len(row_ones),)),
             (np.array(row_ones), np.array(col_ones))),
            shape=square)
        rel_matrix = normalize(m, norm="l1", axis=1)
        if RELATION_WEIGHTING:
            if rel in relation_embeddings:
                # Cosine similarity between the question and the relation.
                rel_embedding = relation_embeddings[rel]
                score = np.dot(question_embedding, rel_embedding) / (
                    np.linalg.norm(question_embedding) *
                    np.linalg.norm(rel_embedding))
            else:
                score = NOTFOUNDSCORE
            rel_matrix = rel_matrix * np.power(score, EXPONENT)
        relation_map[rel] = rel_matrix

    if DECOMPOSE_PPV:
        adj_mat = sum(relation_map.values()) / len(relation_map)
    else:
        adj_mat = csr_matrix(
            (np.ones((len(all_row_ones),)),
             (np.array(all_row_ones), np.array(all_col_ones))),
            shape=square)
    return entity_map, relation_map, normalize(adj_mat, norm="l1", axis=1)
def testLUExecution(self):
    """Check the LU decomposition of dense and sparse tensors for several chunkings."""
    run = self.executor.execute_tensor
    # Keyword arguments for tensor(): default chunking first, then explicit.
    chunk_options = ({}, {'chunk_size': 2}, {'chunk_size': (2, 3)}, {'chunk_size': 4})

    def decompose(raw, options):
        # Factorise and check that L is lower and U is upper triangular.
        a = tensor(raw, **options)
        P, L, U = lu(a)
        result_l = run(L, concat=True)[0]
        result_u = run(U, concat=True)[0]
        np.testing.assert_allclose(np.tril(result_l), result_l)
        np.testing.assert_allclose(np.triu(result_u), result_u)
        return P, L, U, result_l, result_u

    np.random.seed(1)

    data = np.random.randint(1, 10, (6, 6))
    for options in chunk_options:
        P, L, U, _, _ = decompose(data, options)
        t = P.dot(L).dot(U)
        res = run(t, concat=True)[0]
        np.testing.assert_allclose(res, data)

    # test for sparse
    data = sps.csr_matrix([[2, 0, 0, 0, 5, 2],
                           [0, 6, 1, 0, 0, 6],
                           [8, 0, 9, 0, 0, 2],
                           [0, 6, 0, 8, 7, 3],
                           [7, 0, 6, 1, 7, 0],
                           [0, 0, 0, 7, 0, 8]])
    for options in chunk_options:
        P, L, U, result_l, result_u = decompose(data, options)
        self.assertIsInstance(result_l, SparseNDArray)
        self.assertIsInstance(result_u, SparseNDArray)
        t = P.dot(L).dot(U)
        res = run(t, concat=True)[0]
        np.testing.assert_array_almost_equal(data.A, res)
def __init__(self, A):
    """Initialise from a string, or from anything csr_matrix() accepts.

    A string is handed to the base initialiser unchanged. Any other value
    is converted to CSR and passed on as (indptr, indices, data cast to
    uint32, number of columns).
    """
    if isinstance(A, str):
        super().__init__(A)
        return
    matrix = csr_matrix(A)
    n_columns = matrix.get_shape()[1]
    weights = matrix.data.astype('uint32')
    super().__init__(matrix.indptr, matrix.indices, weights, n_columns)
def eval(self):
    """Assemble a stiffness matrix from the mesh faces and solve for displacements.

    Reads material and thickness values, the mesh object, a boundary mask
    and a force vector from the node's input sockets, assembles a global
    matrix face by face, solves the reduced linear system and returns the
    displacement column vector ``U``. The result is also cached as JSON in
    ``self.disp``, and that cache is returned directly when ``self.buffer``
    is set and the cache is non-empty.
    """
    if TIME: start = timer()

    # get held values if they exist
    if self.buffer == True and not self.disp == "":
        load = json.loads(self.disp)
        U = np.array(load)
        return U

    # set input sockets
    input_socket_1 = self.inputs[0]
    input_socket_2 = self.inputs[1]
    input_socket_3 = self.inputs[2]
    input_socket_4 = self.inputs[3]
    input_socket_5 = self.inputs[4]
    input_socket_6 = self.inputs[5]
    input_socket_7 = self.inputs[6]
    input_socket_8 = self.inputs[7]

    input_socket_1.set_value(self.E)
    input_socket_2.set_value(self.v)

    # get inputs from previous nodes
    # NOTE(review): E and v stay unbound if material_input is neither
    # "VALUE" nor "NODE" - confirm those are the only two options.
    if self.material_input == "VALUE":
        E = self.get_value(input_socket_1)
        v = self.get_value(input_socket_2)
    elif self.material_input == "NODE":
        mat_vect = self.get_value(input_socket_3)
        # presumably mat_vect is laid out as (E, G, v, ...) - verify against
        # the node that produces it
        E = mat_vect[0]
        v = mat_vect[2]

    if self.size_input == "VALUE":
        input_socket_4.set_value(self.t)
        t = self.get_value(input_socket_4)
    else:
        t = self.get_value(input_socket_5)

    self.object = self.get_value(input_socket_6)
    object = self.object
    ob = object.data  # not used again in the live code of this method

    # create bmesh environment
    bm = bmesh.new()
    bm.from_mesh(self.object.data)

    # Per-face tables, grown by stacking one row per face onto a dummy first
    # row that is deleted after the loop:
    #   edge_matrix - the vertex indices of the face (three slots)
    #   properties  - [E, G, v, t, 0, 0]
    #   coormat     - the x, y, z coordinates of the face's vertices
    # NOTE(review): the earlier header comment described truss columns
    # (element number, node 1, node 2 / E, area, G, Iy, Iz, J); the loop
    # below fills the rows as listed here.
    edge_matrix = np.zeros((3), dtype=int)
    properties = [0,0,0,0,0,0]
    coormat = np.zeros((9))
    new_row_1 = np.zeros((3), dtype=int)
    new_row_2 = properties  # same list object as the initial `properties`
    i = 0
    G = 0
    for face in bm.faces:
        j = 0
        coorelement = np.array([])
        for vert in face.verts:
            coordinates = np.array([vert.co[0], vert.co[1], vert.co[2]])
            coorelement = np.hstack([coorelement, coordinates])
            # new_row_1 has three slots, so this assumes triangular faces
            new_row_1[j] = vert.index
            j = j + 1
        new_row_2[0] = E
        new_row_2[1] = G
        new_row_2[2] = v
        new_row_2[3] = t
        if DEBUG: print(new_row_1,new_row_2)
        edge_matrix = np.vstack([edge_matrix, new_row_1])
        properties = np.vstack([properties, new_row_2])
        coormat = np.vstack([coormat, coorelement])
        i += 1
    edge_matrix = np.delete(edge_matrix, 0, 0) # find better way to initialize (redundant)
    properties = np.delete(properties, 0, 0)
    coormat = np.delete(coormat, 0, 0)
    print(coormat)
    if DEBUG: print('edge_matrix',edge_matrix)
    if DEBUG: print('properties',properties)

    # Global matrix size: (largest vertex index + 1) * 6.
    # NOTE(review): only columns 1: are scanned although column 0 also holds
    # a vertex index here - confirm the largest index cannot sit in column 0.
    max = (edge_matrix[:,1:].max() + 1) * 6
    if DEBUG: print(max)
    bm.edges.ensure_lookup_table()
    K=np.zeros((max,max))
    Kcst=np.zeros((22, 22))  # only referenced by the commented-out assembly test
    for e in range(len(edge_matrix)):
        # element matrix of face e, then merged into the global matrix
        k = self.ElementStiffnessMatrix(properties, coormat[e, :], edge_matrix, e)
        K = self.SpaceTrussAssemble(K, k, edge_matrix[e, :])
        # Kcst = self.cstassembletest(Kcst, k, edge_matrix[e,:])
        print("e", e)
    K2 = self.test()  # result is not used below
    # (An older, commented-out truss implementation that built per-edge
    # 12x12 element matrices and assembled them stood here.)

    # Invert the mask from socket 7 and use it to select rows and columns.
    # presumably socket 7 marks the constrained entries, so the inverted
    # mask selects the free ones - TODO confirm
    bool = ((self.get_value(input_socket_7)))
    bool = np.invert(bool)
    F = self.get_value(input_socket_8)
    bool = np.ravel(bool)
    boolv,boolh = np.ix_(bool, bool)

    # apply boundary conditions
    Ksolve = K[boolv,boolh]
    F = F[boolv]
    if DEBUG: print(F.shape)
    F= np.reshape(F, (-1,1))
    print('applying boundary conditions')
    if DEBUG2: print(Ksolve)

    # solve for displacement
    Ksolve_csr = sparse.csr_matrix(Ksolve)
    F_csr = sparse.csr_matrix(F)
    print('solving')
    u = scipy.sparse.linalg.spsolve(Ksolve_csr, F_csr)
    print('solving done')
    if DEBUG: print(u)
    bound=np.array([0])  # not used below

    # Scatter the reduced solution back into a full-length column vector;
    # entries that the mask excluded stay zero.
    U=np.zeros((max,1))
    j = 0
    for i in range(len(U)):
        if bool[i] == 1:
            U[i] = u[j]
            j = j + 1
    if DEBUG: print(U)
    if TIME:
        end = timer()
        print("time:", end - start)

    # cache the result as JSON for the early return at the top
    array = U.tolist()
    self.disp = json.dumps(array)
    return U
    # (Commented-out post-processing followed the return: force recovery
    # F = K.dot(U), per-edge stress, vertex-colour output and writing the
    # bmesh back to the object.)
def load_youtube_data(prefix, ptrain):
    """Load the dataset ``prefix``, using an ``.npz`` cache when present.

    If ``data/<prefix>_<ptrain>.npz`` exists, it is read back. Otherwise
    the graph is built from ``data/<prefix>/edges.csv`` and
    ``data/<prefix>/group-edges.csv`` (1-based ids), passed through
    ``data_augmentation`` and written to that cache file.

    Args:
        prefix: Dataset directory name under ``data/``.
        ptrain: Fraction of the labelled nodes used for training.

    Returns:
        Tuple ``(num_data, adj, feats, feats1, labels, train_data,
        val_data, test_data)``.
    """
    npz_file = 'data/{}_{}.npz'.format(prefix, ptrain)

    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        # np.load on an .npz keeps a file handle open until closed, so read
        # everything inside the context manager.
        with np.load(npz_file) as data:
            num_data = data['num_data']
            labels = data['labels']
            train_data = data['train_data']
            val_data = data['val_data']
            test_data = data['test_data']
            adj = sp.csr_matrix((data['adj_data'], data['adj_indices'], data['adj_indptr']),
                                shape=data['adj_shape'])
            feats = sp.csr_matrix((data['feats_data'], data['feats_indices'], data['feats_indptr']),
                                  shape=data['feats_shape'])
            feats1 = sp.csr_matrix((data['feats1_data'], data['feats1_indices'], data['feats1_indptr']),
                                   shape=data['feats1_shape'])
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        start_time = time()

        # read edges (blank lines are skipped instead of crashing int())
        with open('data/' + prefix + '/edges.csv') as f:
            links = [line.split(',') for line in f if line.strip()]
        links = [(int(link[0]) - 1, int(link[1]) - 1) for link in links]
        links = np.array(links).astype(np.int32)
        num_data = np.max(links) + 1
        adj = sp.csr_matrix((np.ones(links.shape[0], dtype=np.float32),
                             (links[:, 0], links[:, 1])),
                            shape=(num_data, num_data))
        adj = adj + adj.transpose()

        def _normalize_adj(adj):
            # Divide each row by its sum; the epsilon keeps empty rows finite.
            rowsum = np.array(adj.sum(1)).flatten()
            d_inv = 1.0 / (rowsum + 1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj)
            return adj

        adj = _normalize_adj(adj)

        feats = sp.eye(num_data, dtype=np.float32).tocsr()
        feats1 = adj.dot(feats)
        num_classes = 47

        # multi-hot label matrix, one row per node
        labels = np.zeros((num_data, num_classes), dtype=np.float32)
        with open('data/' + prefix + '/group-edges.csv') as f:
            for line in f:
                if not line.strip():
                    continue
                line = line.split(',')
                labels[int(line[0]) - 1, int(line[1]) - 1] = 1

        # only nodes with at least one label take part in the split
        data = np.nonzero(labels.sum(1))[0].astype(np.int32)

        np.random.shuffle(data)
        n_train = int(len(data) * ptrain)
        train_data = np.copy(data[:n_train])
        # NOTE(review): validation and test use the same remainder of the
        # shuffled indices - confirm this is intended.
        val_data = np.copy(data[n_train:])
        test_data = np.copy(data[n_train:])

        num_data, adj, feats, feats1, labels, train_data, val_data, test_data = \
            data_augmentation(num_data, adj, adj, feats, labels,
                              train_data, val_data, test_data)

        print("Done. {} seconds.".format(time() - start_time))
        with open(npz_file, 'wb') as fwrite:
            np.savez(fwrite, num_data=num_data,
                     adj_data=adj.data, adj_indices=adj.indices,
                     adj_indptr=adj.indptr, adj_shape=adj.shape,
                     feats_data=feats.data, feats_indices=feats.indices,
                     feats_indptr=feats.indptr, feats_shape=feats.shape,
                     feats1_data=feats1.data, feats1_indices=feats1.indices,
                     feats1_indptr=feats1.indptr, feats1_shape=feats1.shape,
                     labels=labels,
                     train_data=train_data, val_data=val_data,
                     test_data=test_data)

    return num_data, adj, feats, feats1, labels, train_data, val_data, test_data
def partition(A, p, alg, *args):
    """Run the partitioning routine ``alg`` on the CSR form of ``A``.

    ``alg`` is called as ``alg(indptr, indices, data, n_columns, p, *args)``
    with the stored values cast to ``uint32``; its result is returned.
    """
    matrix = csr_matrix(A)
    weights = matrix.data.astype('uint32')
    n_columns = matrix.shape[1]
    return alg(matrix.indptr, matrix.indices, weights, n_columns, p, *args)
def _get_adj(data, coords):
    """Build a square CSR matrix from values and a 2-row coordinate array.

    Row 0 of ``coords`` holds the row indices and row 1 the column indices;
    the side length is ``num_data`` from the enclosing scope.
    """
    row_idx = coords[0, :]
    col_idx = coords[1, :]
    return sp.csr_matrix((data, (row_idx, col_idx)),
                         shape=(num_data, num_data))
# Dense -> CSR -> dense round trip
from numpy import array
from scipy.sparse import csr_matrix

# dense input, mostly zeros
A = array([
    [1, 0, 0, 1, 0, 0],
    [0, 0, 2, 0, 0, 1],
    [0, 0, 0, 2, 0, 0],
])
print(A)

# compressed sparse row form of the same data
S = csr_matrix(A)
print(S)

# expand back to a dense matrix
B = S.todense()
print(B)
def similarityMatrixTopK(self, item_weights, force_sparse_output=True, k=100, verbose=False, inplace=True):
    """
    Keep only the TopK largest elements of each column, zeroing the rest.

    :param item_weights: square similarity matrix, either a numpy array or
        anything check_matrix() can convert to CSC
    :param force_sparse_output: for dense input, return a CSR matrix instead
        of the dense array; sparse input always yields CSR
    :param k: number of elements kept per column, capped at the number of
        columns; k == 0 keeps nothing
    :param verbose: print progress messages
    :param inplace: Default True, WARNING a dense input matrix will be modified
    :return: the TopK matrix
    """

    assert (item_weights.shape[0] == item_weights.shape[1]
            ), "selectTopK: ItemWeights is not a square matrix"

    start_time = time.time()

    if verbose:
        print("Generating topK matrix")

    nitems = item_weights.shape[1]
    k = min(k, nitems)

    # for each column, keep only the top-k scored items
    sparse_weights = not isinstance(item_weights, np.ndarray)

    if not sparse_weights:
        if verbose:
            print("Sorting columns...")
        idx_sorted = np.argsort(item_weights, axis=0)  # sort data inside each column
        if verbose:
            print("Done!")

        W = item_weights if inplace else item_weights.copy()

        # index of the items that don't belong to the top-k similar items of
        # each column; an explicit stop index keeps k == 0 correct, because
        # [:-0] would select nothing
        not_top_k = idx_sorted[:nitems - k, :]
        # use numpy fancy indexing to zero-out the values in sim without using a for loop
        W[not_top_k, np.arange(nitems)] = 0.0

        if force_sparse_output:
            if verbose:
                print("Starting CSR compression...")

            W_sparse = sps.csr_matrix(W, shape=(nitems, nitems))

            if verbose:
                print("Sparse TopK matrix generated in {:.2f} seconds".
                      format(time.time() - start_time))

            return W_sparse

        if verbose:
            print("Dense TopK matrix generated in {:.2f} seconds".format(
                time.time() - start_time))

        return W

    else:
        # iterate over each column and keep only the top-k similar items
        data, rows_indices, cols_indptr = [], [], []

        item_weights = check_matrix(item_weights,
                                    format='csc',
                                    dtype=np.float32)

        for item_idx in range(nitems):
            cols_indptr.append(len(data))

            start_position = item_weights.indptr[item_idx]
            end_position = item_weights.indptr[item_idx + 1]

            column_data = item_weights.data[start_position:end_position]
            column_row_index = item_weights.indices[start_position:end_position]

            non_zero_data = column_data != 0

            idx_sorted = np.argsort(column_data[non_zero_data])  # sort by column
            # explicit start index: [-0:] would keep the whole column
            top_k_idx = idx_sorted[max(len(idx_sorted) - k, 0):]

            data.extend(column_data[non_zero_data][top_k_idx])
            rows_indices.extend(column_row_index[non_zero_data][top_k_idx])

        cols_indptr.append(len(data))

        # During testing CSR is faster
        if verbose:
            print("Generating CSC matrix...")

        W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr),
                                  shape=(nitems, nitems),
                                  dtype=np.float32)

        if verbose:
            print("Converting to CSR...")

        W_sparse = W_sparse.tocsr()

        if verbose:
            print("Sparse TopK matrix generated in {:.2f} seconds".format(
                time.time() - start_time))

        return W_sparse
def load_gcn_data(dataset_str):
    """Load a GCN-style dataset, caching the preprocessed result as an .npz file.

    The cache path is ``data/{dataset_str}_{FLAGS.normalization}.npz``. When it
    exists, everything is read back from it; otherwise the pickled
    ``data/ind.{dataset_str}.*`` files are loaded and preprocessed, and the
    result is written to the cache before returning.

    Returns a tuple ``(num_data, train_adj, full_adj, feats, train_feats,
    test_feats, labels, train_data, val_data, test_data)``. The adjacency and
    feature entries are ``scipy.sparse`` CSR matrices; ``train_data``,
    ``val_data`` and ``test_data`` are index arrays.
    """
    npz_file = 'data/{}_{}.npz'.format(dataset_str, FLAGS.normalization)
    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        data = np.load(npz_file)
        # NOTE(review): this is an array read from the archive, whereas the
        # uncached branch below produces a plain int from ``features.shape[0]``.
        num_data = data['num_data']
        labels = data['labels']
        train_data = data['train_data']
        val_data = data['val_data']
        test_data = data['test_data']
        # Each sparse matrix is stored as its CSR triplet plus shape.
        train_adj = sp.csr_matrix((data['train_adj_data'], data['train_adj_indices'], data['train_adj_indptr']), shape=data['train_adj_shape'])
        full_adj = sp.csr_matrix((data['full_adj_data'], data['full_adj_indices'], data['full_adj_indptr']), shape=data['full_adj_shape'])
        feats = sp.csr_matrix((data['feats_data'], data['feats_indices'], data['feats_indptr']), shape=data['feats_shape'])
        train_feats = sp.csr_matrix((data['train_feats_data'], data['train_feats_indices'], data['train_feats_indptr']), shape=data['train_feats_shape'])
        test_feats = sp.csr_matrix((data['test_feats_data'], data['test_feats_indices'], data['test_feats_indptr']), shape=data['test_feats_shape'])
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        """Load data."""
        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
                # Python 3 needs an explicit encoding to read these pickles.
                # NOTE(review): pickle executes arbitrary code on load; only
                # use with trusted dataset files.
                if sys.version_info > (3, 0):
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))

        x, y, tx, ty, allx, ally, graph = tuple(objects)

        if dataset_str != 'nell':
            test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
            test_idx_range = np.sort(test_idx_reorder)

            if dataset_str == 'citeseer':
                # Fix citeseer dataset (there are some isolated nodes in the graph)
                # Find isolated nodes, add them as zero-vecs into the right position
                test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
                tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
                tx_extended[test_idx_range-min(test_idx_range), :] = tx
                tx = tx_extended
                ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
                ty_extended[test_idx_range-min(test_idx_range), :] = ty
                ty = ty_extended

            # Stack the test rows under the rest, then permute them from
            # sorted order into the order given by the index file.
            features = sp.vstack((allx, tx)).tolil()
            features[test_idx_reorder, :] = features[test_idx_range, :]
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

            labels = np.vstack((ally, ty))
            labels[test_idx_reorder, :] = labels[test_idx_range, :]

            idx_test = test_idx_range.tolist()
            # idx_train = range(len(y))
            # NOTE(review): hard-coded training-set size replacing the
            # len(y)-based split above; it also overlaps idx_val whenever
            # len(y) < 18217 - confirm which datasets this is intended for.
            idx_train = range(18217)
            idx_val = range(len(y), len(y)+500)

            # presumably boolean masks of length labels.shape[0]; verify
            # against sample_mask
            train_mask = sample_mask(idx_train, labels.shape[0])
            val_mask = sample_mask(idx_val, labels.shape[0])
            test_mask = sample_mask(idx_test, labels.shape[0])

            # Per-split label matrices: zero everywhere outside the split.
            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]
        else:
            # 'nell': allx/ally are used directly, without the test-row
            # stacking and reordering done for the other datasets.
            test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
            features = allx.tocsr()
            adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
            labels = ally
            idx_test = test_idx_reorder
            idx_train = range(len(y))
            idx_val = range(len(y), len(y)+969)
            train_mask = sample_mask(idx_train, labels.shape[0])
            val_mask = sample_mask(idx_val, labels.shape[0])
            test_mask = sample_mask(idx_test, labels.shape[0])
            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]

        # num_data, (v, coords), feats, labels, train_d, val_d, test_d
        num_data = features.shape[0]

        def _normalize_adj(adj):
            """Row-normalize ``adj`` (D^-1 A) and return ``(values, coords)`` in COO order."""
            rowsum = np.array(adj.sum(1)).flatten()
            # The small epsilon keeps rows that sum to zero from dividing by zero.
            d_inv = 1.0 / (rowsum+1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj).tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data.astype(np.float32), coords

        def gcn_normalize_adj(adj):
            """Add self-loops, scale by D^-1/2 on both sides, return ``(values, coords)``."""
            adj = adj + sp.eye(adj.shape[0])
            rowsum = np.array(adj.sum(1)) + 1e-20
            d_inv_sqrt = np.power(rowsum, -0.5).flatten()
            d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
            d_mat_inv_sqrt = sp.diags(d_inv_sqrt, 0)
            adj = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)
            adj = adj.tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data.astype(np.float32), coords

        # Normalize features
        rowsum = np.array(features.sum(1)) + 1e-9
        r_inv = np.power(rowsum, -1).flatten()
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv, 0)
        features = r_mat_inv.dot(features)

        if FLAGS.normalization == 'gcn':
            full_v, full_coords = gcn_normalize_adj(adj)
        else:
            full_v, full_coords = _normalize_adj(adj)
        full_v = full_v.astype(np.float32)
        full_coords = full_coords.astype(np.int32)
        # The training graph is the full graph here: no edges are removed.
        train_v, train_coords = full_v, full_coords
        # The split label matrices are zero outside their own masks, so their
        # sum equals the labels on every masked row.
        labels = (y_train + y_val + y_test).astype(np.float32)
        train_data = np.nonzero(train_mask)[0].astype(np.int32)
        val_data = np.nonzero(val_mask)[0].astype(np.int32)
        test_data = np.nonzero(test_mask)[0].astype(np.int32)

        feats = (features.data, features.indices, features.indptr, features.shape)

        def _get_adj(data, coords):
            """Build a ``num_data x num_data`` CSR matrix from values and (row, col) coords."""
            adj = sp.csr_matrix((data, (coords[0,:], coords[1,:])),
                                shape=(num_data, num_data))
            return adj

        train_adj = _get_adj(train_v, train_coords)
        full_adj = _get_adj(full_v, full_coords)
        feats = sp.csr_matrix((feats[0], feats[1], feats[2]),
                              shape=feats[-1], dtype=np.float32)

        # Features pre-multiplied by each normalized adjacency matrix.
        train_feats = train_adj.dot(feats)
        test_feats = full_adj.dot(feats)

        # Persist everything so the next call takes the cached branch above.
        with open(npz_file, 'wb') as fwrite:
            np.savez(fwrite, num_data=num_data,
                     train_adj_data=train_adj.data, train_adj_indices=train_adj.indices, train_adj_indptr=train_adj.indptr, train_adj_shape=train_adj.shape,
                     full_adj_data=full_adj.data, full_adj_indices=full_adj.indices, full_adj_indptr=full_adj.indptr, full_adj_shape=full_adj.shape,
                     feats_data=feats.data, feats_indices=feats.indices, feats_indptr=feats.indptr, feats_shape=feats.shape,
                     train_feats_data=train_feats.data, train_feats_indices=train_feats.indices, train_feats_indptr=train_feats.indptr, train_feats_shape=train_feats.shape,
                     test_feats_data=test_feats.data, test_feats_indices=test_feats.indices, test_feats_indptr=test_feats.indptr, test_feats_shape=test_feats.shape,
                     labels=labels,
                     train_data=train_data, val_data=val_data,
                     test_data=test_data)

    return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data
def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:

        tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))

    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i

    Params:
      movies...The movies DataFrame; its 'tokens' column holds one list of
               terms per movie.
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column
        named 'features'.
      - The vocab, a dict from term to int. The vocab is sorted
        alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    from collections import Counter

    tokens_in_movies = movies['tokens']

    # df[term] = number of documents that contain the term at least once.
    df = Counter()
    for tokens in tokens_in_movies:
        df.update(set(tokens))

    # Column ids follow the alphabetical order of the terms.
    vocab = {term: col for col, term in enumerate(sorted(df))}

    N = len(tokens_in_movies)
    num_features = len(vocab)

    csr_list = []
    for tokens in tokens_in_movies:
        term_counts = Counter(tokens)
        indices = []
        data = []
        if term_counts:
            max_k = max(term_counts.values())
            # One entry per *distinct* term. Emitting one entry per token
            # occurrence creates duplicate column indices, which scipy sums,
            # inflating the weight of every repeated term.
            for term in sorted(term_counts):
                indices.append(vocab[term])
                idf = math.log10(N / df[term])
                data.append(term_counts[term] / max_k * idf)
        csr_list.append(
            csr_matrix((data, indices, [0, len(indices)]),
                       shape=(1, num_features)))

    # Reuse the frame's own index so rows line up even when the index is not
    # the default 0..n-1 range.
    movies['features'] = pd.Series(csr_list, index=movies.index)
    return movies, vocab
def load_graphsage_data(prefix, normalize=True):
    """Load a GraphSAGE-format dataset, caching the preprocessed result.

    Reads ``{prefix}-G.json``, ``{prefix}-feats.npy``, ``{prefix}-id_map.json``
    and ``{prefix}-class_map.json``, builds row-normalized train/full
    adjacency matrices, and stores everything in an ``.npz`` file next to the
    inputs (``{prefix}.npz``, or ``{prefix}_deg{N}.npz`` when
    ``FLAGS.max_degree`` is not -1). Later calls read that file instead.

    Node attributes are accessed through ``G.node``; a guard that used to sit
    here (disabled) required networkx <= 1.11 for loading GraphSAGE data.

    :param prefix: common path prefix of the dataset files
    :param normalize: standardize the features with a ``StandardScaler``
        fitted on the training nodes only
    :return: ``(num_data, train_adj, full_adj, feats, train_feats,
        test_feats, labels, train_data, val_data, test_data)``
    """
    # Save normalized version
    if FLAGS.max_degree == -1:
        npz_file = prefix + '.npz'
    else:
        npz_file = '{}_deg{}.npz'.format(prefix, FLAGS.max_degree)

    if os.path.exists(npz_file):
        start_time = time()
        print('Found preprocessed dataset {}, loading...'.format(npz_file))
        # The context manager closes the archive once every array is read.
        with np.load(npz_file) as data:
            # Plain int, matching what the uncached branch returns.
            num_data = int(data['num_data'])
            feats = data['feats']
            train_feats = data['train_feats']
            test_feats = data['test_feats']
            labels = data['labels']
            train_data = data['train_data']
            val_data = data['val_data']
            test_data = data['test_data']
            train_adj = sp.csr_matrix(
                (data['train_adj_data'], data['train_adj_indices'], data['train_adj_indptr']),
                shape=data['train_adj_shape'])
            full_adj = sp.csr_matrix(
                (data['full_adj_data'], data['full_adj_indices'], data['full_adj_indptr']),
                shape=data['full_adj_shape'])
        print('Finished in {} seconds.'.format(time() - start_time))
    else:
        print('Loading data...')
        start_time = time()

        with open(prefix + "-G.json") as graph_file:
            G_data = json.load(graph_file)
        G = json_graph.node_link_graph(G_data)

        feats = np.load(prefix + "-feats.npy").astype(np.float32)

        with open(prefix + "-id_map.json") as id_map_file:
            id_map = json.load(id_map_file)
        # JSON keys are always strings; turn them back into ints when the
        # first key is numeric.
        if list(id_map.keys())[0].isdigit():
            conversion = int
        else:
            conversion = lambda n: n
        id_map = {conversion(k): int(v) for k, v in id_map.items()}

        with open(prefix + "-class_map.json") as class_map_file:
            class_map = json.load(class_map_file)
        # A list value is kept as-is (one row of labels per node); anything
        # else is treated as a single integer class id.
        multi_label = isinstance(list(class_map.values())[0], list)
        if multi_label:
            lab_conversion = lambda n: n
        else:
            lab_conversion = int
        class_map = {conversion(k): lab_conversion(v) for k, v in class_map.items()}

        ## Remove all nodes that do not have val/test annotations
        ## (necessary because of networkx weirdness with the Reddit data)
        to_remove = [node for node in G.nodes() if node not in id_map]
        broken_count = len(to_remove)
        for node in to_remove:
            G.remove_node(node)
        print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(broken_count))

        # Construct adjacency matrix
        print("Loaded data ({} seconds).. now preprocessing..".format(time()-start_time))
        start_time = time()

        edges = []
        for edge in G.edges():
            if edge[0] in id_map and edge[1] in id_map:
                edges.append((id_map[edge[0]], id_map[edge[1]]))
        print('{} edges'.format(len(edges)))
        num_data = len(id_map)

        if FLAGS.max_degree != -1:
            print('Subsampling edges...')
            edges = subsample_edges(edges, num_data, FLAGS.max_degree)

        val_data = np.array([id_map[n] for n in G.nodes()
                             if G.node[n]['val']], dtype=np.int32)
        test_data = np.array([id_map[n] for n in G.nodes()
                              if G.node[n]['test']], dtype=np.int32)
        # Builtin bool: the np.bool alias no longer exists in NumPy >= 1.24.
        is_train = np.ones((num_data), dtype=bool)
        is_train[val_data] = False
        is_train[test_data] = False
        train_data = np.array([n for n in range(num_data) if is_train[n]], dtype=np.int32)

        # Training edges have both endpoints in the training set.
        train_edges = [(e[0], e[1]) for e in edges if is_train[e[0]] and is_train[e[1]]]
        edges = np.array(edges, dtype=np.int32)
        train_edges = np.array(train_edges, dtype=np.int32)

        # Process labels
        if multi_label:
            num_classes = len(list(class_map.values())[0])
            labels = np.zeros((num_data, num_classes), dtype=np.float32)
            for k in class_map.keys():
                labels[id_map[k], :] = np.array(class_map[k])
        else:
            num_classes = len(set(class_map.values()))
            labels = np.zeros((num_data, num_classes), dtype=np.float32)
            for k in class_map.keys():
                labels[id_map[k], class_map[k]] = 1

        if normalize:
            from sklearn.preprocessing import StandardScaler
            # Fit on training nodes only, then transform every node.
            train_ids = np.array([id_map[n] for n in G.nodes()
                                  if not G.node[n]['val'] and not G.node[n]['test']])
            train_feats = feats[train_ids]
            scaler = StandardScaler()
            scaler.fit(train_feats)
            feats = scaler.transform(feats)

        def _normalize_adj(edges):
            """Symmetrize the edge list and row-normalize it; return (values, coords)."""
            adj = sp.csr_matrix((np.ones((edges.shape[0]), dtype=np.float32),
                                 (edges[:, 0], edges[:, 1])), shape=(num_data, num_data))
            adj += adj.transpose()

            rowsum = np.array(adj.sum(1)).flatten()
            # The small epsilon keeps isolated nodes from dividing by zero.
            d_inv = 1.0 / (rowsum+1e-20)
            d_mat_inv = sp.diags(d_inv, 0)
            adj = d_mat_inv.dot(adj).tocoo()
            coords = np.array((adj.row, adj.col)).astype(np.int32)
            return adj.data, coords

        train_v, train_coords = _normalize_adj(train_edges)
        full_v, full_coords = _normalize_adj(edges)

        def _get_adj(data, coords):
            """Build a ``num_data x num_data`` CSR matrix from values and (row, col) coords."""
            adj = sp.csr_matrix((data, (coords[0, :], coords[1, :])),
                                shape=(num_data, num_data))
            return adj

        train_adj = _get_adj(train_v, train_coords)
        full_adj = _get_adj(full_v, full_coords)
        # Features pre-multiplied by each normalized adjacency matrix.
        train_feats = train_adj.dot(feats)
        test_feats = full_adj.dot(feats)

        print("Done. {} seconds.".format(time()-start_time))
        with open(npz_file, 'wb') as fwrite:
            print('Saving {} edges'.format(full_adj.nnz))
            np.savez(fwrite, num_data=num_data,
                     train_adj_data=train_adj.data, train_adj_indices=train_adj.indices, train_adj_indptr=train_adj.indptr, train_adj_shape=train_adj.shape,
                     full_adj_data=full_adj.data, full_adj_indices=full_adj.indices, full_adj_indptr=full_adj.indptr, full_adj_shape=full_adj.shape,
                     feats=feats, train_feats=train_feats, test_feats=test_feats,
                     labels=labels,
                     train_data=train_data, val_data=val_data,
                     test_data=test_data)

    return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data
def test_concatenate():
    """Exercise ``AnnData.concatenate`` with inner and outer joins on dense and sparse ``X``."""
    from numpy import ma
    from scipy.sparse import csr_matrix

    # ---- dense data ----
    first = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']},
        {'var_names': ['a', 'b', 'c'], 'annoA': [0, 1, 2]})
    second = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']},
        {'var_names': ['d', 'c', 'b'], 'annoA': [0, 1, 2]})
    third = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        {'obs_names': ['s1', 's2'], 'anno2': ['d3', 'd4']},
        {'var_names': ['d', 'c', 'b'], 'annoB': [0, 1, 2]})

    # inner join: only the variables common to all three inputs survive
    joined = first.concatenate(second, third)
    expected_inner = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]
    assert joined.X.astype(int).tolist() == expected_inner
    assert joined.obs_keys() == ['anno1', 'anno2', 'batch']
    assert joined.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2']
    assert joined.var.values.tolist() == [[1, 2, 2], [2, 1, 1]]

    # custom name for the batch column
    joined = first.concatenate(second, third, batch_key='batch1')
    assert joined.obs_keys() == ['anno1', 'anno2', 'batch1']

    # custom batch category labels
    labels = ['a1', 'a2', 'a3']
    joined = first.concatenate(second, third, batch_categories=labels)
    assert joined.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
    assert joined.var_names.tolist() == ['b', 'c']

    # outer join: missing entries are compared through NaN masks
    joined = first.concatenate(second, third, join='outer')
    nan = np.nan
    got_X = ma.masked_invalid(joined.X)
    want_X = ma.masked_invalid(np.array([
        [1.0, 2.0, 3.0, nan],
        [4.0, 5.0, 6.0, nan],
        [nan, 3.0, 2.0, 1.0],
        [nan, 6.0, 5.0, 4.0],
        [nan, 3.0, 2.0, 1.0],
        [nan, 6.0, 5.0, 4.0]]))
    assert np.array_equal(got_X.mask, want_X.mask)
    assert np.allclose(got_X.compressed(), want_X.compressed())

    got_var = ma.masked_invalid(joined.var.values.tolist())
    want_var = ma.masked_invalid(np.array(
        [[0.0, nan, nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [nan, 0.0, 0.0]]))
    assert np.array_equal(got_var.mask, want_var.mask)
    assert np.allclose(got_var.compressed(), want_var.compressed())

    # ---- sparse data ----
    first = AnnData(
        csr_matrix([[0, 2, 3], [0, 5, 6]]),
        {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']},
        {'var_names': ['a', 'b', 'c']})
    second = AnnData(
        csr_matrix([[0, 2, 3], [0, 5, 6]]),
        {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']},
        {'var_names': ['d', 'c', 'b']})
    third = AnnData(
        csr_matrix([[1, 2, 0], [0, 5, 6]]),
        {'obs_names': ['s5', 's6'], 'anno2': ['d3', 'd4']},
        {'var_names': ['d', 'c', 'b']})

    # inner join
    joined = first.concatenate(second, third)
    expected_inner = [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]]
    assert joined.X.toarray().astype(int).tolist() == expected_inner

    # outer join
    joined = first.concatenate(second, third, join='outer')
    expected_outer = [
        [0.0, 2.0, 3.0, 0.0],
        [0.0, 5.0, 6.0, 0.0],
        [0.0, 3.0, 2.0, 0.0],
        [0.0, 6.0, 5.0, 0.0],
        [0.0, 0.0, 2.0, 1.0],
        [0.0, 6.0, 5.0, 0.0]]
    assert joined.X.toarray().tolist() == expected_outer
def adjacency_matrix(self):
    """Return a CSR matrix with a 1 for every edge of ``self.undirected_edges()``.

    Column 0 of the edge array is used as the row index and column 1 as the
    column index. The matrix is ``num_vertices x num_vertices``. Edges that
    appear more than once are summed by the sparse constructor.
    """
    E = self.undirected_edges()
    # Always 1-D. np.squeeze(np.ones((len(E), 1))) collapses to a 0-d scalar
    # when there is exactly one edge, which the sparse constructor rejects.
    vals = np.ones(len(E))
    return sp.csr_matrix((vals, (E[:, 0], E[:, 1])),
                         shape=(self.num_vertices, self.num_vertices))
def check_randomized_svd_low_rank(dtype):
    """Check that extmath.randomized_svd is consistent with linalg.svd.

    Runs the comparison for every stable power-iteration normalizer, on both
    the dense matrix and its CSR representation.

    :param dtype: dtype the test matrix is cast to before decomposition
    """
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    # Keep the sparse copy under its own name. Rebinding X inside the loop
    # would make every iteration after the first feed the sparse matrix to
    # the "dense" assertions as well.
    X_sparse = sparse.csr_matrix(X)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert_equal(Ua.shape, (n_samples, k))
        assert_equal(sa.shape, (k, ))
        assert_equal(Va.shape, (k, n_features))

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation:
        # compute the singular values using the fast approximate method
        Ua, sa, Va = randomized_svd(
            X_sparse, k, power_iteration_normalizer=normalizer,
            random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape``.

    :param filename: path (or file object) accepted by ``np.load``
    :return: the reconstructed ``csr_matrix``
    """
    # The archive is read lazily, so the matrix is built before the context
    # manager closes the underlying file.
    with np.load(filename) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape)
def form_poisson_equation_impl(height, width, alpha, normals, depth_weight, depth):
    """
    For 4-credit students only.

    Creates a Poisson equation given the normals and depth at every pixel in
    the image. The solution to the Poisson equation is the estimated depth.
    When the mode is 'depth' in 'combine.py', the equation should return the
    actual depth. When it is 'normals', the equation should integrate the
    normals to estimate depth. When it is 'both', the equation should weight
    the contribution from normals and actual depth, using the parameter
    'depth_weight'.

    Input:
        height -- height of input depth,normal array
        width -- width of input depth,normal array
        alpha -- stores the alpha value at each pixel of the image. If
            alpha = 0, the pixel's normal/depth is not taken into
            consideration for depth estimation
        normals -- stores the normals (nx,ny,nz) at each pixel of the image.
            None if mode is 'depth' in combine.py
        depth_weight -- parameter to trade off between normals and depth when
            the estimation mode is 'both'; None is treated as 1. A high
            weight on normals means a low depth_weight. Giving high weight
            to normals results in a smoother surface, but the surface may be
            very different from what the input depth map shows.
        depth -- stores the depth at each pixel of the image.
            None if mode is 'normals' in combine.py
    Output:
        constants for an equation of type Ax = b, where x is the vector of
        depths at each pixel (length height*width, row-major)
        A -- left-hand side coefficients of the Poisson equation. A can be
            very large but sparse, so a csr_matrix is used to represent it.
        b -- right-hand side constants of the Poisson equation

    Equations, one row of A and b each:
        depth (when ``depth`` is given): for every pixel k with non-zero
            alpha, depth_weight * x[k] = depth_weight * depth[k]. On its own
            this reproduces the input depth; the weight matters once the
            normal equations are added in 'both' mode.
        normals (when ``normals`` is given): for every pixel k whose
            x-neighbour k+1 exists, with non-zero alpha at both pixels,
            nz[k] * (x[k+1] - x[k]) = nx[k]. Likewise for the y-neighbour
            k+width with right-hand side -ny[k].

    Note: the normals in the image are positive when they point along the
    +x, +y, -z axes, as seen from the camera's viewpoint.
    """
    assert alpha.shape == (height, width)
    assert normals is None or normals.shape == (height, width, 3)
    assert depth is None or depth.shape == (height, width)

    # A is assembled in coordinate form: for every non-zero A[i, j] = v there
    # is an index k with row_ind[k] = i, col_ind[k] = j, data_arr[k] = v.
    row_ind = []
    col_ind = []
    data_arr = []
    # b gets exactly one value per equation (row of A).
    b = []
    if depth_weight is None:
        depth_weight = 1

    # Number of equations emitted so far, i.e. the index of the next row.
    row = 0

    for i in range(height):
        for j in range(width):
            if alpha[i, j] == 0:
                # Masked-out pixels contribute no equations at all.
                continue
            k = i * width + j

            if depth is not None:
                # depth_weight * x[k] = depth_weight * depth[k]
                row_ind.append(row)
                col_ind.append(k)
                data_arr.append(depth_weight)
                b.append(depth_weight * depth[i, j])
                row += 1

            if normals is not None:
                nx, ny, nz = normals[i, j]

                # Assuming the surface is nearly planar within one pixel, the
                # normal projected into the xz-plane is perpendicular to the
                # tangent there, which gives
                # (x[k+1] - x[k]) * nz[k] = nx[k].
                if j + 1 < width and alpha[i, j + 1] != 0:
                    row_ind.extend((row, row))
                    col_ind.extend((k, k + 1))
                    data_arr.extend((-nz, nz))
                    b.append(nx)
                    row += 1

                # Same along y; the sign of ny flips because the row index
                # grows in the opposite direction of +y.
                if i + 1 < height and alpha[i + 1, j] != 0:
                    row_ind.extend((row, row))
                    col_ind.extend((k, k + width))
                    data_arr.extend((-nz, nz))
                    b.append(-ny)
                    row += 1

    # Convert all the lists to numpy array
    row_ind = np.array(row_ind, dtype=np.int32)
    col_ind = np.array(col_ind, dtype=np.int32)
    data_arr = np.array(data_arr, dtype=np.float32)
    b = np.array(b, dtype=np.float32)

    # Create a compressed sparse matrix from indices and values
    A = csr_matrix((data_arr, (row_ind, col_ind)), shape=(row, width * height))

    return A, b
def bow(category, hNeg=True, noun=False):
    """Build a bag-of-words matrix from the reviews of one product category.

    Reads ``sorted_data_acl/<category>/positive.review`` and
    ``sorted_data_acl/<category>/negative.review``, tokenizes the title and
    text of every review, and counts word occurrences per review.

    :param category: sub-directory of ``sorted_data_acl`` to read
    :param hNeg: when True, run ``handle_negation`` on every review. This is
        applied to both polarities; transforming only one class would make
        the features depend on the label.
    :param noun: forwarded unchanged to ``clear_text``
    :return: ``(sMatrix, target, vocabulary)`` where ``sMatrix[i, j]`` is the
        number of times ``vocabulary[j]`` occurs in review ``i`` (CSR),
        ``target[i]`` is 1 for positive and 0 for negative reviews (positive
        reviews come first), and ``vocabulary`` is the sorted word list
    """
    category += '/'

    def _load_reviews(polarity):
        # Parse one review file and keep only its <review> elements. The
        # file is closed as soon as the document has been parsed.
        path = 'sorted_data_acl/' + category + polarity + '.review'
        with open(path, 'r') as handle:
            return BeautifulSoup(handle, 'lxml').find_all(['review'])

    def _count_words(review):
        # Turn one review into a {word: occurrences} dict.
        # Keep only the title and the text of the review.
        text = (review.find('title').string +
                review.find('review_text').string).lower()
        tokens = nltk.word_tokenize(text)
        # Project-specific token clean-up.
        tokens = clear_text(tokens, noun)
        # Optionally merge negations into bigrams.
        if hNeg:
            tokens = handle_negation(tokens)
        bag = {}
        for word in tokens:
            bag[word] = bag.get(word, 0) + 1
        return bag

    positive_reviews = _load_reviews('positive')
    negative_reviews = _load_reviews('negative')
    n_pos_reviews = len(positive_reviews)
    n_neg_reviews = len(negative_reviews)
    n_reviews = n_pos_reviews + n_neg_reviews

    # One bag per review, positives first so the target vector lines up.
    bags = [_count_words(review) for review in positive_reviews]
    bags.extend(_count_words(review) for review in negative_reviews)

    # Sorted unique words: a deterministic column order across runs.
    vocabulary = sorted(set().union(*bags))
    column_of = {word: col for col, word in enumerate(vocabulary)}

    # Assemble the count matrix directly in sparse form; entry (i, j) is the
    # number of times word j appears in review i.
    rows, cols, counts = [], [], []
    for i, bag in enumerate(bags):
        for word, count in bag.items():
            rows.append(i)
            cols.append(column_of[word])
            counts.append(count)
    sMatrix = csr_matrix((counts, (rows, cols)),
                         shape=(n_reviews, len(vocabulary)), dtype=int)

    # Target array: 1 for positive reviews, 0 for negative ones.
    target = np.zeros((n_pos_reviews + n_neg_reviews), dtype="int")
    target[:n_pos_reviews] = 1

    return sMatrix, target, vocabulary