def create_sptensors(self):
    """
    Create sparse tensors.
    :return:
    """
    tuples = []
    # TODO: add edges between timeframes
    for i, (t, graph) in enumerate(graphs.iteritems()):
        for u, v in graph.edges_iter():
            tuples.append([self.node_pos[u], self.node_pos[v], i])
            tuples.append([self.node_pos[v], self.node_pos[u], i])
    triplets = np.array(list(set([(u, v, t) for u, v, t in tuples])))
    a = sptensor(tuple(triplets.T), vals=np.ones(len(triplets)),
                 shape=(len(self.node_ids), len(self.node_ids), len(graphs)))

    o_values = []
    # FIXME: Tensor O should be column normalized
    sum_rows = np.zeros((a.shape[0], a.shape[2]))
    for t in range(a.shape[2]):
        for i in range(a.shape[0]):
            for j in range(a.shape[1]):
                # TODO: just add another for loop instead of : to access .sum()
                # TODO: check sparse tensor performance and library
                sum_rows[i, t] += a[i, j, t]
        for i in range(a.shape[0]):
            if sum_rows[i, t] != 0:
                for j in range(i):
                    if a[i, j, t] != 0:
                        o_values.append(a[j, i, t] / sum_rows[j, t])
                        if i != j:
                            o_values.append(a[i, j, t] / sum_rows[i, t])
    o = sptensor(tuple(triplets.T), vals=o_values,
                 shape=(len(self.node_ids), len(self.node_ids), len(graphs)))

    r_values = []
    sum_time = np.zeros((a.shape[0], a.shape[1]))
    for i in range(a.shape[0]):
        # OPTIMIZE: sum is a dense matrix/array. Should be sparse for memory
        for j in range(a.shape[1]):
            for t in range(a.shape[2]):
                # TODO: just add another for loop instead of : to access .sum()
                # TODO: check sparse tensor performance and library
                if a[i, j, t] != 0:
                    sum_time[i, j] += a[i, j, t]
    for t in range(a.shape[2]):
        for i in range(a.shape[0]):
            for j in range(i):
                if a[j, i, t] != 0:
                    r_values.append(a[j, i, t] / sum_time[j, i])
                    r_values.append(a[i, j, t] / sum_time[i, j])
    r = sptensor(tuple(triplets.T), vals=r_values,
                 shape=(len(self.node_ids), len(self.node_ids), len(graphs)))
    return a, o, r, sum_rows, sum_time
def build_sparse_B_from_A(A):
    """
    Create the sptensor adjacency tensor of a NetworkX graph.

    Parameters
    ----------
    A : list
        List of MultiDiGraph NetworkX objects.

    Returns
    -------
    data : sptensor
        Graph adjacency tensor.
    data_T : sptensor
        Graph adjacency tensor (transpose).
    v_T : ndarray
        Array with values of entries A[j, i] given non-zero entry (i, j).
    rw : list
        List whose elements are reciprocity (considering the weights of the
        edges) values, one per layer.
    """
    N = A[0].number_of_nodes()
    L = len(A)
    rw = []

    d1 = np.array((), dtype='int64')
    d2, d2_T = np.array((), dtype='int64'), np.array((), dtype='int64')
    d3, d3_T = np.array((), dtype='int64'), np.array((), dtype='int64')
    v, vT, v_T = np.array(()), np.array(()), np.array(())
    for l in range(L):
        b = nx.to_scipy_sparse_matrix(A[l])
        b_T = nx.to_scipy_sparse_matrix(A[l]).transpose()
        rw.append(np.sum(b.multiply(b_T)) / np.sum(b))
        nz = b.nonzero()
        nz_T = b_T.nonzero()
        d1 = np.hstack((d1, np.array([l] * len(nz[0]))))
        d2 = np.hstack((d2, nz[0]))
        d2_T = np.hstack((d2_T, nz_T[0]))
        d3 = np.hstack((d3, nz[1]))
        d3_T = np.hstack((d3_T, nz_T[1]))
        v = np.hstack((v, np.array([b[i, j] for i, j in zip(*nz)])))
        vT = np.hstack((vT, np.array([b_T[i, j] for i, j in zip(*nz_T)])))
        v_T = np.hstack((v_T, np.array([b[j, i] for i, j in zip(*nz)])))
    subs_ = (d1, d2, d3)
    subs_T_ = (d1, d2_T, d3_T)
    data = skt.sptensor(subs_, v, shape=(L, N, N), dtype=v.dtype)
    data_T = skt.sptensor(subs_T_, vT, shape=(L, N, N), dtype=vT.dtype)
    return data, data_T, v_T, rw
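A minimal usage sketch for build_sparse_B_from_A above, assuming networkx as nx, numpy as np, and sktensor as skt are imported as in the snippet, and a NetworkX release that still provides nx.to_scipy_sparse_matrix; the toy graphs are illustrative only.

import networkx as nx

# Two toy directed multigraph layers over the same three nodes.
g1 = nx.MultiDiGraph()
g1.add_nodes_from([0, 1, 2])
g1.add_edges_from([(0, 1), (1, 0), (1, 2)])

g2 = nx.MultiDiGraph()
g2.add_nodes_from([0, 1, 2])
g2.add_edges_from([(2, 0), (0, 2)])

data, data_T, v_T, rw = build_sparse_B_from_A([g1, g2])
print(data.shape)  # (2, 3, 3): layer x node x node
print(rw)          # per-layer weighted reciprocity, e.g. [2/3, 1.0] here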
def update_expec(self, data, avgs):
    if isinstance(data, skt.dtensor):
        ind = data.nonzero()
        non_zero_ent = data[ind]
    elif isinstance(data, skt.sptensor):
        ind = data.subs
        non_zero_ent = data.vals
    size_ind = ind[0].size

    log_a = np.ones((size_ind, self.comp))
    for m in xrange(self.modes):
        log_a *= avgs[m][ind[m], :]
    log_a = np.log(log_a.sum(axis=1))

    q = np.empty((size_ind, self.n_trunc), dtype=np.float64)
    for i in range(1, self.n_trunc + 1):
        q[:, i - 1] = (-i * self.lam) + (non_zero_ent - i) * np.log(i) + i * log_a + i - 1
    norm = logsumexp(q, axis=1)
    q = np.exp(q - norm[:, np.newaxis])

    self.expec = np.zeros((size_ind, ))
    for i in range(1, self.n_trunc + 1):
        self.expec += i * q[:, i - 1]
    self.expec = skt.sptensor(ind, self.expec, shape=self.dim, dtype=np.float64)
def build_sparse_B_from_A(A):
    """
    Create the sptensor adjacency tensor of a NetworkX graph.

    Parameters
    ----------
    A : list
        List of MultiDiGraph NetworkX objects.

    Returns
    -------
    data : sptensor
        Graph adjacency tensor.
    """
    N = A[0].number_of_nodes()
    L = len(A)

    d1 = np.array((), dtype='int64')
    d2 = np.array((), dtype='int64')
    d3 = np.array((), dtype='int64')
    v = np.array(())
    for l in range(L):
        b = nx.to_scipy_sparse_matrix(A[l])
        nz = b.nonzero()
        d1 = np.hstack((d1, np.array([l] * len(nz[0]))))
        d2 = np.hstack((d2, nz[0]))
        d3 = np.hstack((d3, nz[1]))
        v = np.hstack((v, np.array([b[i, j] for i, j in zip(*nz)])))
    subs_ = (d1, d2, d3)
    data = skt.sptensor(subs_, v, shape=(L, N, N), dtype=v.dtype)
    return data
def __init__(self, subs, vals, shape=None, dtype=int, accumfun=sum.__call__):
    if len(vals) <= 0:
        raise ValueError("the input tensor is ZERO!")
    subs = np.asarray(subs)
    ns, ndims = subs.shape
    self._dimsmin_ = np.min(subs, 0)
    self._dimsmap_ = list()
    for d in range(ndims):
        undim = np.unique(subs[:, d])
        self._dimsmap_.append(dict(zip(undim, range(len(undim)))))
    nwsubs = list()
    for k in range(ns):
        term = list()
        for d in range(ndims):
            term.append(self._dimsmap_[d][subs[k, d]])
        nwsubs.append(np.asarray(term))
    tensor = sptensor(tuple(np.asarray(nwsubs).T), np.asarray(vals), shape,
                      dtype, accumfun=accumfun)
    self.data = dict(zip(map(tuple, np.asarray(tensor.subs).T), tensor.vals))
    self.shape = tensor.shape
    self.ndim = tensor.ndim
    self.nnz = tensor.nnz()
    self.vals = np.sum(self.data.values())
def test_spttv(subs, vals, shape):
    S = sptensor(subs, vals, shape=shape)
    K = ktensor([
        np.random.randn(shape[0], 2),
        np.random.randn(shape[1], 2),
        np.random.randn(shape[2], 2)
    ])
    K.innerprod(S)
def test_spttv():
    subs = (
        array([0, 1, 0, 5, 7, 8]),
        array([2, 0, 4, 5, 3, 9]),
        array([0, 1, 2, 2, 1, 0])
    )
    vals = array([1, 1, 1, 1, 1, 1])
    S = sptensor(subs, vals, shape=[10, 10, 3])
    K = ktensor([randn(10, 2), randn(10, 2), randn(3, 2)])
    K.innerprod(S)
def test_spttv():
    # subs = (
    #     array([0, 1, 0, 5, 7, 8]),
    #     array([2, 0, 4, 5, 3, 9]),
    #     array([0, 1, 2, 2, 1, 0])
    # )
    # vals = array([1, 1, 1, 1, 1, 1])
    S = sptensor(subs, vals, shape=shape)
    K = ktensor([randn(shape[0], 2), randn(shape[1], 2), randn(shape[2], 2)])
    K.innerprod(S)
def test_spttv():
    #subs = (
    #    array([0, 1, 0, 5, 7, 8]),
    #    array([2, 0, 4, 5, 3, 9]),
    #    array([0, 1, 2, 2, 1, 0])
    #)
    #vals = array([1, 1, 1, 1, 1, 1])
    S = sptensor(subs, vals, shape=shape)
    K = ktensor([randn(shape[0], 2), randn(shape[1], 2), randn(shape[2], 2)])
    K.innerprod(S)
def _update_theta_gamma(self, m):
    subs_I_M = np.where(self.y_E_DIMS > 1e-4)
    y_spt_DIMS = skt.sptensor(subs_I_M, self.y_E_DIMS[subs_I_M],
                              shape=self.y_E_DIMS.shape, dtype=float)
    tmp_DIMS = y_spt_DIMS.vals / self._reconstruct_nz(y_spt_DIMS.subs)
    uttkrp_nonzero_DK = sp_uttkrp(tmp_DIMS, y_spt_DIMS.subs, m, self.theta_G_DK_M)
    self.theta_shp_DK_M[m][:, :] = self.alpha + self.theta_G_DK_M[m] * uttkrp_nonzero_DK
def assignBlock(self, i, s, X, Y, Z1, Z2, S, tensor_dim_ceiling, subs_idx,
                num_workers, tensor_dim_size):
    _dict = {}
    num_ways = len(tensor_dim_ceiling)
    strata_index = [int(math.floor(i + sum([float(s) / num_workers**way_index
                                            for way_index in range(way_index + 1)]))) % num_workers
                    for way_index in range(num_ways)]
    strata_range = [range(int(math.ceil(strata_index[way_index] * tensor_dim_ceiling[way_index])),
                          int(math.ceil((strata_index[way_index] + 1) * tensor_dim_ceiling[way_index])))
                    for way_index in range(num_ways)]
    strata_range = [[o for o in each_range if o < tensor_dim_size[index]]
                    for index, each_range in enumerate(strata_range)]
    strata_range = [range(each_range[0], (each_range[-1] + 1)) for each_range in strata_range]
    total_nb_points = len(subs_idx.value)
    subs = [idx for idx in subs_idx.value
            if all([idx[way_index] in strata_range[way_index] for way_index in range(num_ways)])]
    subs_x = [tuple(idx) for idx in subs_idx.value
              if all([idx[way_index] in strata_range[way_index] for way_index in range(num_ways)])]
    X_vals = []
    Y_vals = []
    ZX_vals = []
    ZY_vals = []
    S_vals = []
    if len(subs_x) > 0:
        for i in range(len(subs_x)):
            tensor_index = tuple(np.array(subs_x[i]).T)
            X_vals.append(X[tensor_index][0])
            Y_vals.append(Y[tensor_index][0])
            ZX_vals.append(Z1[tensor_index][0])
            ZY_vals.append(Z2[tensor_index][0])
            S_vals.append(S[tensor_index][0])
        X_subs = sptensor(tuple(np.array(subs_x).T), X_vals, shape=tensor_dim_size, dtype=np.float)
        Y_subs = sptensor(tuple(np.array(subs_x).T), Y_vals, shape=tensor_dim_size, dtype=np.float)
        ZX_subs = sptensor(tuple(np.array(subs_x).T), ZX_vals, shape=tensor_dim_size, dtype=np.float)
        ZY_subs = sptensor(tuple(np.array(subs_x).T), ZY_vals, shape=tensor_dim_size, dtype=np.float)
        S_subs = sptensor(tuple(np.array(subs_x).T), S_vals, shape=tensor_dim_size, dtype=np.float)
        _dict['ratio'] = len(subs_x) / float(total_nb_points)
        _dict['X_subs'] = X_subs
        _dict['Y_subs'] = Y_subs
        _dict['ZX_subs'] = ZX_subs
        _dict['ZY_subs'] = ZY_subs
        _dict['S_subs'] = S_subs
        _dict['subs'] = subs
        return _dict
    else:
        return None
def load_sptensor(fp, start_index=1, dtype=None):
    nmodes = int(fp.readline())
    ndims = tuple(map(int, fp.readline().split()))
    assert nmodes == len(ndims)
    subs = tuple(([] for m in ndims))
    vals = []
    while True:
        line = fp.readline()
        if not line:
            break
        linesep = line.split()
        for m, x in enumerate(linesep[:-1]):
            subs[m].append(int(x) - start_index)
        vals.append((dtype or float)(linesep[-1]))
    return sktensor.sptensor(subs, vals, shape=ndims, dtype=dtype)
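A small usage sketch for load_sptensor above. The expected text layout (number of modes on the first line, the dimensions on the second, then one subscript-plus-value entry per line, 1-based by default) is read off directly from the function body; io.StringIO just stands in for a real file handle.

import io

tns_text = io.StringIO(
    "3\n"
    "4 4 2\n"
    "1 2 1 3.0\n"
    "4 1 2 5.0\n"
)
T = load_sptensor(tns_text)
print(T.shape)  # (4, 4, 2)
print(T.vals)   # values parsed from the entry lines: 3.0 and 5.0
print(T.subs)   # zero-based subscripts after the start_index shift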
def create_sptensor(self, graphs):
    """
    Create a sparse tensor
    :param graphs:
    :return:
    """
    tuples = []
    # triplets = np.array([(u, v, t) for t in range(1, len(graphs)+1) for u, v in graphs[i].edges_iter()] +
    #                     [(v, u, t) for t in range(1, len(graphs)+1) for u, v in graphs[i].edges_iter()])
    for i, graph in graphs.iteritems():
        for u, v in graph.edges_iter():
            tuples.append([self.node_pos[u], self.node_pos[v], i - 1])
            tuples.append([self.node_pos[v], self.node_pos[u], i - 1])
    triplets = np.array([(u, v, t) for u, v, t in tuples])
    T = sptensor(tuple(triplets.T), vals=np.ones(len(triplets)),
                 shape=(len(self.node_ids), len(self.node_ids), len(graphs)))
    return T
def get_tensor(middle_end='sktensor', cutoff=10):
    logging.info('Reweighting: log')
    verb_tensor_path = os.path.join(
        projdir, '{}/tensor_{}.pkl'.format(middle_end, cutoff))
    if os.path.exists(verb_tensor_path):
        logging.info('Loading tensor from {}'.format(verb_tensor_path))
        tensor, indices = pickle.load(open(verb_tensor_path, mode='rb'))
        logging.debug(tensor.shape)
        return tensor, indices
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        items = sorted(filter(lambda item: item[1] >= cutoff, freq_dict.items()),
                       key=operator.itemgetter(1), reverse=True)
        logging.debug(items[-3:])
        return dict([(w, i) for i, (w, f) in enumerate(items)])

    coords, data = ([], [], []), []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Building tensor...')
    logging.info(' Populating lists...')
    for i, ((svo), freq) in enumerate(occurrence.items()):
        if not i % 2000000:
            logging.debug(' {:,}'.format(i))  # '{} {}'.format(svo[1], freq))
        for i, word in enumerate(svo):
            if svo[i] not in indices[i]:
                break
        else:
            for i, word in enumerate(svo):
                coords[i].append(indices[i][svo[i]])
            data.append(np.log(freq))
    logging.info(' Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data, shape=shape)  # , has_duplicates=False)
    elif middle_end == 'sktensor':
        tensor = sktensor.sptensor(coords, data, shape=shape)
    else:
        raise NotImplementedError
    pickle.dump((tensor, indices), open(verb_tensor_path, mode='wb'))
    logging.info(tensor)
    return tensor, indices
def sptensor_from_dense_array(X):
    """
    Create an sptensor from a ndarray or dtensor.

    Parameters
    ----------
    X : ndarray
        Input data.

    Returns
    -------
    sptensor from a ndarray or dtensor.
    """
    subs = X.nonzero()
    vals = X[subs]
    return skt.sptensor(subs, vals, shape=X.shape, dtype=X.dtype)
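A brief round-trip sketch for sptensor_from_dense_array above, with numpy as np and sktensor as skt assumed as in the snippet; the toarray() call on the result mirrors its use elsewhere in this collection.

import numpy as np

X = np.zeros((3, 4, 2), dtype=int)
X[0, 1, 0] = 5
X[2, 3, 1] = 7

S = sptensor_from_dense_array(X)
print(S.shape)                          # (3, 4, 2)
print(S.vals)                           # the two non-zero entries, 5 and 7
assert np.array_equal(S.toarray(), X)   # densifying recovers the original array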
def mach(X, ranks, p):
    """
    Implementation of MACH proposed in
    C. E. Tsourakakis. MACH: Fast randomized tensor decompositions.
    In ICDM, pages 689–700, 2010.
    """
    prod_ns = np.prod(X.shape)
    indn = np.random.choice(prod_ns, int(prod_ns * p), replace=False)
    multinds = np.unravel_index(indn, X.shape)
    X_sp = st.sptensor(multinds, (1 / p) * X[multinds], shape=X.shape)

    ### for sparse eigen decomposition (scipy linalg problem)
    _ranks = np.array(ranks)
    _shape = np.array(X.shape)
    _ind = _ranks >= _shape
    _ranks[_ind] = _shape[_ind] - 1

    return st.tucker_hooi(X_sp, _ranks.tolist(), init='nvecs')
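A hedged call sketch for mach above, assuming numpy as np and sktensor as st as in the snippet; the tensor size, Tucker ranks, and sampling probability are illustrative, and the (core, factor matrices) unpacking assumes sktensor's tucker_hooi return convention.

import numpy as np

np.random.seed(0)
X = np.random.rand(20, 20, 20)              # small dense tensor to sketch
core, U = mach(X, ranks=[5, 5, 5], p=0.3)   # keep ~30% of entries, rescaled by 1/p
print(core.shape)                           # (5, 5, 5)
print([u.shape for u in U])                 # [(20, 5), (20, 5), (20, 5)]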
def _init_data(self, data, mask=None):
    if isinstance(data, np.ndarray):
        data = skt.sptensor(data.nonzero(), data[data.nonzero()], data.shape)
    assert isinstance(data, skt.sptensor)
    assert data.ndim == 4
    assert data.shape[0] == data.shape[1]
    V, A, T = data.shape[1:]
    self.n_actors = V
    self.n_actions = A
    self.n_timesteps = T

    if mask is not None:
        assert isinstance(mask, np.ndarray)
        assert (mask.ndim == 2) or (mask.ndim == 3)
        assert mask.shape[-2:] == (V, V)
        assert np.issubdtype(mask.dtype, np.integer)
    return data
points_c = []
vals = []
for k in range(len(predicates)):
    tups = dictTrp1[k]
    for tup in tups:
        points_a.append(tup[0])
        points_b.append(tup[1])
        points_c.append(k)
        vals.append(tup[2])
L = []
L.append(points_a)
L.append(points_b)
L.append(points_c)
X1 = sptensor(tuple(L), vals, shape=(I, J, len(predicates)), dtype=float)

points_a = []
points_b = []
points_c = []
vals = []
for k in range(len(predicates)):
    tups = dictTrp2[k]
    for tup in tups:
        points_a.append(tup[0])
        points_b.append(tup[1])
        points_c.append(k)
        vals.append(tup[2])
L = []
def get_tensor(self, log=True, divide_by_marginal=False):

    def get_index(freq_dict):
        items = sorted(filter(lambda item: item[1] >= self.cutoff, freq_dict.items()),
                       key=operator.itemgetter(1), reverse=True)
        logging.debug(items[-3:])
        return dict([(w, i) for i, (w, f) in enumerate(items)])

    verb_tensor_path = os.path.join(self.projdir, '{}_{}.pkl').format(
        'pmi' if divide_by_marginal else 'logfreq',  # TODO
        self.cutoff)
    if False:  # os.path.exists(verb_tensor_path):
        logging.info('Loading tensor from {}'.format(verb_tensor_path))
        tensor, indices = pickle.load(open(verb_tensor_path, mode='rb'))
        logging.debug(tensor.shape)
        return tensor, indices
    occurrence, marginals = self.mazsola_reader()
    coords, data = tuple([] for _ in range(self.ndim)), []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Building tensor...')
    logging.info(' Populating lists...')
    total = 0
    for i, ((svo), freq) in enumerate(occurrence.items()):
        if not i % 2000000:
            logging.debug(' {:,}'.format(i))
        for j, word in enumerate(svo):
            if svo[j] not in indices[j]:
                break
        else:
            for i, word in enumerate(svo):
                coords[i].append(indices[i][svo[i]])
            to_debug = (coords[0][-1] == 1 and coords[1][-1] == 1 and coords[2][-1] == 0)
            # if not pmi:  # TODO
            freq += 1
            if to_debug:
                logging.debug(freq)
            total += freq
            if to_debug:
                logging.debug(freq)
            if divide_by_marginal:
                # TODO PPMI
                for i in range(self.ndim):
                    freq /= marginals[i][svo[i]]
                    if to_debug:
                        logging.debug((marginals[i][svo[i]], freq))
            if log:
                freq = np.log(freq)
            if to_debug:
                logging.debug(freq)
            data.append(freq)
    logging.info(' Total: {}'.format(total))
    logging.info(' Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    data = np.array(data)
    if divide_by_marginal:
        if log:
            data += 2 * np.log(total)
        else:
            data *= total**2
    tensor = sktensor.sptensor(coords, data, shape=shape)
    # pickle.dump((tensor, indices), open(verb_tensor_path, mode='wb'))
    logging.info(tensor)
    return tensor, indices
import numpy.random as rn
import cPickle as pickle
import numpy as np
import sktensor as skt

data = rn.poisson(0.2, size=(25, 25, 100))  # 3-mode sparse count tensor of size 25 x 25 x 100

subs = data.nonzero()        # subscripts where the ndarray has non-zero entries
vals = data[data.nonzero()]  # corresponding values of non-zero entries

sp_data = skt.sptensor(subs,  # create an sktensor.sptensor
                       vals,
                       shape=data.shape,
                       dtype=data.dtype)

with open('data.dat', 'w+') as f:  # can be stored as a .dat using pickle
    pickle.dump(sp_data, f)

with open('data.dat', 'r') as f:   # can be loaded back in using pickle.load
    tmp = pickle.load(f)

assert np.allclose(tmp.vals, sp_data.vals)
def tosptensor(self):
    return sptensor(tuple(np.asarray(self.data.keys()).T),
                    np.asarray(self.data.values()),
                    self.shape)
def HOALS(data, dims, ranks, model='tucker', lambda_=0.8, alpha=0.1,
          num_iters=5, implicit=False):
    """
    Parameters
    data : DataFrame
        [0] : userId
        [1] : itemId
        [2] : actionId
        [3] : rating
    dims : list
        [0] : number of users
        [1] : number of items
        [2] : number of actions
    """
    data.columns = np.arange(data.shape[1])
    C_train = sktensor.sptensor((data[2], data[0], data[1]), data[3],
                                shape=(dims[2], dims[0], dims[1]))

    # ==========================================================================
    # retrieval of the (user, item, rating) triples from the unfolded matrices
    # ==========================================================================
    # train set
    C1 = sktensor.csr_matrix(C_train.unfold(1))
    y1 = list(C1.indices)
    indptr1 = C1.indptr
    r1 = list(C1.data)
    tmp1 = indptr1[1:len(indptr1)] - indptr1[0:(len(indptr1) - 1)]
    x1 = []
    for i in np.arange(len(tmp1)):
        x1.extend(np.repeat(i, tmp1[i]))

    C2 = sktensor.csr_matrix(C_train.unfold(2))
    y2 = list(C2.indices)
    indptr2 = C2.indptr
    r2 = list(C2.data)
    tmp2 = indptr2[1:len(indptr2)] - indptr2[0:(len(indptr2) - 1)]
    x2 = []
    for i in np.arange(len(tmp2)):
        x2.extend(np.repeat(i, tmp2[i]))

    C3 = sktensor.csr_matrix(C_train.unfold(0))
    y3 = list(C3.indices)
    indptr3 = C3.indptr
    r3 = list(C3.data)
    tmp3 = indptr3[1:len(indptr3)] - indptr3[0:(len(indptr3) - 1)]
    x3 = []
    for i in np.arange(len(tmp3)):
        x3.extend(np.repeat(i, tmp3[i]))

    dataTrain = {}
    dataTrain[0] = pd.DataFrame([x1, y1, r1]).T
    dataTrain[1] = pd.DataFrame([x2, y2, r2]).T
    dataTrain[2] = pd.DataFrame([x3, y3, r3]).T

    dataTrain[0] = dataTrain[0][dataTrain[0][2] != 0]  # where the rating is not null
    dataTrain[1] = dataTrain[1][dataTrain[1][2] != 0]
    dataTrain[2] = dataTrain[2][dataTrain[2][2] != 0]

    # ==========================================================================
    # Factorization
    # ==========================================================================
    ratings = {}
    res = {}
    features = {}
    times = []
    for i in range(3):
        if i == 0:
            mode = 'User'
        elif i == 1:
            mode = 'Item'
        elif i == 2:
            mode = 'Action'
        print("Start " + mode + " Learning")
        dataTrain[i] = sqlContext.createDataFrame(dataTrain[i]).rdd
        ratings[i] = dataTrain[i].map(lambda l: Rating(float(l[0]), float(l[1]), float(l[2])))
        # ratings[i] = dataTrain[i].map(lambda l: array([float(l[0]), float(l[1]), float(l[2])]))

        # Build the recommendation model using Alternating Least Squares
        t0 = time.time()
        if implicit:
            res[i] = ALS.trainImplicit(ratings=ratings[i], rank=ranks[i],
                                       iterations=num_iters, seed=0,
                                       lambda_=lambda_, alpha=alpha)
        else:
            res[i] = ALS.train(ratings=ratings[i], rank=ranks[i],
                               iterations=num_iters, seed=0, lambda_=lambda_)
        t1 = time.time()
        delta = t1 - t0
        print('time :', delta)
        times.append(delta)
        # features[i] = res[i].userFeatures()
    print('longest mode time :', np.max(times))
subs_1 = np.append(data[:, :2], np.zeros((n, 1)), 1)
subs_2 = np.append(data[:, :2], np.ones((n, 1)), 1)
subs = np.vstack([subs_1, subs_2])
subs = subs.astype(int)

vals = np.hstack([data[:, 2], data[:, 3]])
vals = vals.flatten()

# convert subs to a tuple of arrays (rows, cols, tubes)
subs = (subs[:, 0], subs[:, 1], subs[:, 2])

# load into sparse tensor
T = sptensor(subs, vals)

logging.debug("Starting Tucker decomposition")
# T = loadmat('../datasets/alyawarra/alyawarradata.mat')['Rs']
# X = [lil_matrix(T[:, :, k]) for k in range(T.shape[2])]

# Decompose tensor using Tucker (HOOI)
P = tucker_hooi(T, [10, 10, 2], init='random')
logging.debug("Finished tucker decomposition")
def main(n_top_words, alpha, beta, rank, priv, n_iters=200):
    # output_data_shape = (n_docs, n_words)
    # theta_DK = np.random.gamma(alpha, beta, (n_docs, rank))
    # phi_KV = np.random.gamma(alpha, beta, (rank, n_words))
    # poisson_priors_DV = parafac((theta_DK, phi_KV.T))
    # data_DV = np.random.poisson(poisson_priors_DV, output_data_shape)
    with np.load('sotu_years.npz') as dat_file:
        data_DV = dat_file['Y_DV']
        vocab = dat_file['types_V']
    n_docs, n_words = data_DV.shape

    bpptf_model = BPPTF(n_modes=2, n_components=rank, verbose=True, max_iter=1)
    bptf_model = BPTF(n_modes=2, n_components=rank, verbose=True, max_iter=1)

    # initialize both models
    modes = (0, 1)
    data_usable = preprocess(data_DV)
    if isinstance(data_usable, skt.dtensor):
        bpptf_model.data_DIMS = data_usable.copy()
    else:
        bpptf_model.data_DIMS = skt.sptensor(
            tuple((np.copy(ds) for ds in data_usable.subs)),
            data_usable.vals.copy())
    bpptf_model._init_all_components(data_usable.shape)
    bptf_model._init_all_components(data_usable.shape)
    bpptf_model.y_E_DIMS = data_usable
    if isinstance(data_usable, skt.sptensor):
        bpptf_model.y_E_DIMS = bpptf_model.y_E_DIMS.toarray()

    for i in range(n_iters):
        print i
        for m in modes:
            # check_equal(bpptf_model, bptf_model, m)
            bpptf_model._update_theta_gamma(m)
            bptf_model._update_gamma(m, data_usable)
            # check_equal(bpptf_model, bptf_model, m)
            bpptf_model._update_theta_delta(m, None)
            bptf_model._update_delta(m, None)
            # check_equal(bpptf_model, bptf_model, m)
            bpptf_model._update_cache(m)
            bptf_model._update_cache(m)
            # check_equal(bpptf_model, bptf_model, m)
            bpptf_model._update_beta(m)  # must come after cache update!
            bptf_model._update_beta(m)
            # check_equal(bpptf_model, bptf_model, m)
            bpptf_model._check_component(m)
            bptf_model._check_component(m)
            # check_equal(bpptf_model, bptf_model, m)

    print "Old topics"
    new_phi = bptf_model.E_DK_M[1].T
    top_words = np.argpartition(new_phi, n_words - n_top_words)[:, -n_top_words:]
    for topic in xrange(rank):
        top_word_vals = zip(-new_phi[topic, top_words[topic]],
                            vocab[top_words[topic]])
        print topic, ' '.join(
            ['{}'.format(wd) for (_, wd) in sorted(top_word_vals)])

    print "\nNew topics"
    new_phi = bpptf_model.theta_E_DK_M[1].T
    top_words = np.argpartition(new_phi, n_words - n_top_words)[:, -n_top_words:]
    for topic in xrange(rank):
        top_word_vals = zip(-new_phi[topic, top_words[topic]],
                            vocab[top_words[topic]])
        print topic, ' '.join(
            ['{}'.format(wd) for (_, wd) in sorted(top_word_vals)])
def sptensor_from_dense_array(X):
    """Creates an sptensor from an ndarray or dtensor."""
    subs = X.nonzero()
    vals = X[subs]
    return skt.sptensor(subs, vals, shape=X.shape, dtype=X.dtype)
def fromarray(A):
    """Create an sptensor from a dense numpy array."""
    subs = np.nonzero(A)
    vals = A[subs]
    return sptensor(subs, vals, shape=A.shape, dtype=A.dtype)
# Generate a synthetic toy data set
true_A_IK = prg(alpha, lambd, size=(n_genes, n_feats))  # synthetic genes x feats matrix
true_P_JK = prg(alpha, lambd, size=(n_cells, n_feats))  # synthetic cells x feats matrix
true_M_IJ = true_A_IK.dot(true_P_JK.T)                  # synthetic mean of observed counts
true_Y_IJ = np.zeros_like(true_M_IJ, dtype=int)         # synthetic observed counts
true_Y_IJ[true_M_IJ > 0] = rn.poisson(true_M_IJ[true_M_IJ > 0])

subs = true_Y_IJ.nonzero()             # subscripts where the ndarray has non-zero entries
vals = true_Y_IJ[true_Y_IJ.nonzero()]  # corresponding values of non-zero entries
sp_data = skt.sptensor(subs,           # create an sktensor.sptensor
                       vals,
                       shape=true_Y_IJ.shape,
                       dtype=true_Y_IJ.dtype)

sns.heatmap(true_Y_IJ, cmap='Blues')
plt.show()

model = PRGPMF(n_genes=n_genes, n_cells=n_cells, n_feats=n_feats,
               alpha=alpha, lambd=lambd, seed=seed, n_threads=n_threads)

n_samples = 100  # how many posterior samples to collect
def RDFParsing(self, path, isSparse):
    # parse RDF
    g = rdflib.Graph()
    g.parse(file=open(path, "r"), format="application/rdf+xml")
    entities = []
    predicates = []
    for s in g.subjects(None, None):
        ss = s.encode('utf-8').__str__()
        if ss not in entities:
            entities.extend([ss])
    for o in g.objects(None, None):
        oo = o.encode('utf-8').__str__()
        if type(o).__name__ != "Literal":
            if oo not in entities:
                entities.extend([oo])
    for p in g.predicates(None, None):
        pp = p.encode('utf-8').__str__()
        if pp not in predicates:
            predicates.extend([pp])
    entities = np.array(entities)
    # print entities
    logging.warning("*************************")
    predicates = np.array(predicates)
    # print predicates

    # prepare tensor frontal slices as np matrices
    lenentities = len(entities)
    lenpredicates = len(predicates)
    logging.warning("Tensor: " + str(lenentities) + " X " + str(lenentities) + " X " + str(lenpredicates))
    T = np.zeros((lenentities, lenentities, lenpredicates), dtype=np.int)
    print "entities".join(map(str, entities))
    for s, p, o in g:
        try:
            logging.warning("try to build the tensor")
            ss = s.__str__().encode('utf-8')
            oo = o.__str__().encode('utf-8')
            pp = p.__str__().encode('utf-8')
            i, j, k = entities.tolist().index(ss), entities.tolist().index(oo), predicates.tolist().index(pp)
            # logging.warning("ijk".join(map(str, [i, j, k])))
            # print "ijk".join(map(str, [i, j, k]))
            value = 1.0
            if "__" in p:
                # print p.split("__")[len(p.split("__")) - 1]
                value = float(p.split("__")[len(p.split("__")) - 1])
                # print "value" + value
            T[i, j, k] = value
        except:
            # printException()
            continue
    if isSparse:
        xyz, c = self.sparsed(T)
        logging.warning("xyz: ")
        logging.warning("-".join(map(str, xyz)))
        # logging.warning("c: ".join(c))
        Tensor = sptensor(xyz, c, shape=(lenentities, lenentities, lenpredicates), dtype=np.int)
    else:
        Tensor = dtensor(T)
    # print "*************************"
    # print Tensor[:,:,slice]
    return Tensor, g, entities, predicates
def run_case_study():
    iter_cnt = 10
    nb_trial = 3
    alg_names = [PAIRFAC]
    # alg_names = [SDCDT]
    iters = [iter_cnt] * len(alg_names)
    if alg_names[0].__name__ in ["SDCDT"]:
        distance = int(sys.argv[1])
        alpha = float(sys.argv[2])
        beta = float(sys.argv[3])
        alpha_pars = [alpha]
        beta_pars = [beta]
        gamma_pars = [1e+0]
        delta_pars = [1e+0]
    else:
        distance = 3
        alpha = float(sys.argv[1])
        beta = float(sys.argv[2])
        gamma = float(sys.argv[3])
        alpha_pars = [alpha]
        beta_pars = [beta]
        gamma_pars = [gamma]
        delta_pars = [1e-8]

    case_study = "ha_{}mp4d".format(str(sys.argv[4]))
    case_study = "wpi_{}mp4d".format(str(sys.argv[4]))
    sub_dir = "classification"
    num_workers = 2
    nb_points = 40000  # nyc
    train_proportions = [10]
    layers = [0]
    _type = "function"
    if case_study.startswith("wpi_"):
        nb_points = 4000
        dims = [10, 59, 2]
        dims = [266, 59, 4, 10]
        if case_study.endswith("4d"):
            dims = [376, 58, 7, 10]
        if "5" in case_study:
            dims = [266, 59, 4, 10]
        if "6" in case_study:
            dims = [408, 68, 4, 10]
        domain = "math"
        source1 = "good"
        source2 = "bad"
        R_set = {0: 6}
        R_check = 6
    if case_study.startswith("ha_"):
        nb_points = 4000
        dims = [5, 34, 6, 54]
        domain = "mooc"
        source1 = "good"
        source2 = "bad"
        R_set = {0: 6}
        R_check = 6
    k = domain + "_" + case_study

    bootstrap_seed_list = [0, 1, 2, 3, 4]
    for bootstrap_seed in bootstrap_seed_list:
        _log.info("distance:{}".format(distance))
        _log.info("bootstrap_seed:{}".format(bootstrap_seed))
        from datetime import datetime
        idx_list, value_list_X, value_list_Y, value_list_ZX, value_list_ZY, value_list_S = read_domain_data(
            source1, source2, dims, domain, _type, case_study, k, nb_points,
            bootstrap_seed=bootstrap_seed)
        X = sptensor(tuple(np.asarray(idx_list).T), value_list_X, shape=dims, dtype=np.float)
        Y = sptensor(tuple(np.asarray(idx_list).T), value_list_Y, shape=dims, dtype=np.float)

        conf = SparkConf().setAppName("PairFac...")
        sc = SparkContext(conf=conf)

        Lambda_all = [alpha_pars, beta_pars, gamma_pars, delta_pars, train_proportions]
        Lambda_all = list(itertools.product(*Lambda_all))
        len_paraset = len(Lambda_all)
        cur_para_index = 0

        train_cur_proportion = float(10)
        test_portion = (10.0 - train_cur_proportion) / 2.0
        np.random.seed(2)
        Weight = np.random.choice(
            [0, 1, 2, 3], size=(len(value_list_ZX), ),
            p=[test_portion / 10,
               test_portion / 10,
               train_cur_proportion / 10,
               (10.0 - train_cur_proportion - test_portion * 2) / 10])
        train_index_1 = train_index_2 = np.where(Weight == 2)
        validation_index_1 = validation_index_2 = np.where(Weight == 1)
        test_index_1 = test_index_2 = np.where(Weight == 0)

        X_train = construct_tensor(value_list_X, train_index_1, idx_list, dims)
        Y_train = construct_tensor(value_list_Y, train_index_1, idx_list, dims)
        ZX_train = construct_tensor(value_list_ZX, train_index_1, idx_list, dims)
        ZY_train = construct_tensor(value_list_ZY, train_index_1, idx_list, dims)
        S_train = construct_tensor(value_list_S, train_index_1, idx_list, dims)
        non_zero_idxs = np.asarray(idx_list)[train_index_1[0]]

        D_matrix = np.zeros((X_train.shape[0], X_train.shape[0]))
        W_matrix = np.zeros((X_train.shape[0], X_train.shape[0]))

        for alg_name, iter_cnt in zip(alg_names, iters):
            for each_lambda in Lambda_all:
                progress = cur_para_index * 1.0 / len_paraset
                cur_para_index += 1
                _log.info('[{}] Running {}...'.format(alg_names[0].__name__, progress))
                alg = alg_name()
                Lambda = list(each_lambda)
                cur_paras = '_'.join([str(x) for x in each_lambda])
                fname = 'weight_s_t_2_{}_layer_0_distance_{}_seed_{}_R_{}'.format(
                    alg.__class__.__name__, distance, bootstrap_seed, R_check)
                directory_ = PROJECT_DIR + "/output/output_" + k + "_" + sub_dir + ""
                if not os.path.exists(directory_):
                    os.makedirs(directory_)
                layer_fileName = directory_ + "/weights/" + str(cur_paras) + "/" + str(fname)
                embeddings_dir = directory_ + "/embeddings/" + str(alg_names[0].__name__) + "/" + \
                    str(bootstrap_seed) + "/" + cur_paras
                if not os.path.exists(embeddings_dir):
                    os.makedirs(embeddings_dir)
                weights_dir = directory_ + "/weights/" + cur_paras
                if not os.path.exists(weights_dir):
                    os.makedirs(weights_dir)
                if os.path.exists(layer_fileName):
                    _log.info("{} exists".format(layer_fileName))
                    continue
                alg.run_multi_trials(
                    sc, X_train, Y_train, ZX_train, ZY_train, S_train, k, k,
                    Lambda, D_matrix, W_matrix,
                    num_trial=nb_trial, max_iter=iter_cnt, verbose=2, noise=0.01,
                    nb_points=nb_points, non_zero_idxs=non_zero_idxs,
                    num_workers=num_workers, distance=distance,
                    R_set=R_set, layers=layers, bootstrap_seed=bootstrap_seed)
                gc.collect()
        sc.stop()
def NTF_sampling_24H(sampling, df_classified, flags, start, stop, path,
                     vectorizer_new=0, vectorizer_s=0, n_topics=10,
                     n_features=1000, matlab=False, monuments=False):
    # Feed the vectorizer with all the words in the dataset. Counts is the tweet/term matrix.
    # fit_transform: fit first (build the features list with the relevant words),
    # then transform: build the tweet/term matrix with the relevant tokens.
    if not vectorizer_new:
        print 'No vectorizer defined. Returning None'
        return None
    if matlab:
        name_matlab = path + 'matlab/TorInst{}Matr'.format(n_features)

    Coord_CRS_global = []
    Data_CRS_global = []
    Ncells = []
    snapshots = df_classified.columns.tolist()[0:-1]
    if start >= stop:
        print 'incorrect start and/or stop dates. performing NTF on whole passed dataset'
        stop = min(len(flags), len(snapshots))

    # For every snapshot taken
    ct = 0
    for month in snapshots:
        ct += 1
        print flags[snapshots.index(month)]
        This_Month = df_classified[month].tolist()
        # print len(list_reviews_rest), 'tagged cells for ', year_month
        # Learn the vocabulary dictionary and return term-document matrix.
        print len(This_Month)
        counts = vectorizer_new.transform(This_Month)
        # Transform a count matrix to a normalized tf-idf representation
        # (i.e. terms with frequencies too hi or lo are removed).
        # Weights are indexed by (postID, term): weight
        tfidf = TfidfTransformer().fit_transform(counts)
        if matlab:
            savemat(name_matlab + str(ct), {'tfidf': tfidf})
        # print 'tfidf done:'
        # print tfidf
        C, D = IL.read_CRS_totensor(
            tfidf, n_features, snapshots.index(month) - flags.index(start))
        # print 'C,D'  # , C, D
        Coord_CRS_global.append(C)
        Data_CRS_global.append(D)

    triples = []
    triples_data = []
    # For every month in the timeline
    for i in range(0, len(Coord_CRS_global)):
        c = Coord_CRS_global[i]
        # For every post in this month
        for e in c:
            # Add the non-zero elements coordinates
            triples.append(e)
    for d in Data_CRS_global:
        for e in d:
            triples_data.append(e)
    triples = [list(i) for i in triples]
    try:
        # maxNcells = max([e[0] for e in triples])
        maxNcells = len(df_classified[start])
    except ValueError:
        print 'no non-zero element. returning None'
        print triples

    # Build a sktensor, which is ncp friendly. The dimensions have to be
    # N_bins x n_features x N_months.
    # N_months = len(Nposts) e.g., or len(Coord_CRS_global)
    # N_posts_total = sum(Nposts)
    X = sktensor.sptensor(tuple(list(np.asarray(triples).T)),
                          triples_data,
                          shape=(maxNcells, n_features, df_classified.shape[1]))
    X_approx_ks = ncp.nonnegative_tensor_factorization(X, n_topics, method='anls_bpp')

    A = X_approx_ks.U[0]
    B = X_approx_ks.U[1]
    C = X_approx_ks.U[2]
    lambdas = X_approx_ks.lmbda

    voc_vector = {k: v for v, k in vectorizer_s.vocabulary_.iteritems()}
    voc_serie = pd.Series(voc_vector)
    TermVectors = []
    TermVectorsIndex = []
    for row in B.T:
        row = list(row)
        row = [(r, row.index(r)) for r in sorted(row)[::-1]]
        TermVectors.append(set([voc_vector[e[1]] for e in row]))
        TermVectorsIndex.append([(voc_vector[e[1]], e[0]) for e in row])
    for i in range(0, len(TermVectorsIndex)):
        TermVectorsIndex[i].sort(key=lambda tup: tup[1])
        TermVectorsIndex[i] = TermVectorsIndex[i][::-1]
    return A, B, C, TermVectorsIndex, TermVectors, lambdas
def construct_tensor(value_list, value_index, idx_list, dims):
    return sptensor(tuple(np.asarray(idx_list)[value_index[0]].T),
                    list(np.asarray(value_list)[value_index[0]]),
                    shape=dims, dtype=np.float)