Example #1
    def create_sptensors(self, graphs):
        """
        Create the sparse adjacency tensor and its normalized variants.
        :param graphs: dict mapping timeframe index to a NetworkX graph
        :return: adjacency tensor a, normalized tensors o and r, plus the row and time sums
        """
        tuples = []
        # TODO: add edges between timeframes
        for i, (t, graph) in enumerate(graphs.iteritems()):
            for u, v in graph.edges_iter():
                tuples.append([self.node_pos[u], self.node_pos[v], i])
                tuples.append([self.node_pos[v], self.node_pos[u], i])
        triplets = np.array(list(set([(u, v, t) for u, v, t in tuples])))
        a = sptensor(tuple(triplets.T),
                     vals=np.ones(len(triplets)),
                     shape=(len(self.node_ids), len(self.node_ids),
                            len(graphs)))
        o_values = []
        # FIXME: Tensor O should be column-normalized
        sum_rows = np.zeros((a.shape[0], a.shape[2]))
        for t in range(a.shape[2]):
            for i in range(a.shape[0]):
                for j in range(a.shape[1]):
                    # TODO : just add another for loop instead of : to access .sum()
                    # TODO : check sparse tensor performance and library
                    sum_rows[i, t] += a[i, j, t]
            for i in range(a.shape[0]):
                if sum_rows[i, t] != 0:
                    for j in range(i):
                        if a[i, j, t] != 0:
                            o_values.append(a[j, i, t] / sum_rows[j, t])
                            if i != j:
                                o_values.append(a[i, j, t] / sum_rows[i, t])

        o = sptensor(tuple(triplets.T),
                     vals=o_values,
                     shape=(len(self.node_ids), len(self.node_ids),
                            len(graphs)))
        r_values = []
        sum_time = np.zeros((a.shape[0], a.shape[1]))
        for i in range(a.shape[0]):
            # OPTIMIZE: sum is a dense matrix/array. Should be sparse for memory
            for j in range(a.shape[1]):
                for t in range(a.shape[2]):
                    # TODO : just add another for loop instead of : to access .sum()
                    # TODO : check sparse tensor performance and library
                    if a[i, j, t] != 0:
                        sum_time[i, j] += a[i, j, t]
        for t in range(a.shape[2]):
            for i in range(a.shape[0]):
                for j in range(i):
                    if a[j, i, t] != 0:
                        r_values.append(a[j, i, t] / sum_time[j, i])
                        r_values.append(a[i, j, t] / sum_time[i, j])
        r = sptensor(tuple(triplets.T),
                     vals=r_values,
                     shape=(len(self.node_ids), len(self.node_ids),
                            len(graphs)))
        return a, o, r, sum_rows, sum_time
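The TODO comments above flag the per-entry loops as the slow part. A minimal vectorized sketch (not from the original code) that computes the same sum_rows and sum_time directly from the triplets index array, relying on a being a binary tensor whose non-zeros are exactly the rows of triplets:

import numpy as np

def sums_from_triplets(triplets, n_nodes, n_times):
    # sum_rows[i, t]: number of non-zeros in row i of timeframe t
    # sum_time[i, j]: number of timeframes in which edge (i, j) is present
    sum_rows = np.zeros((n_nodes, n_times))
    sum_time = np.zeros((n_nodes, n_nodes))
    np.add.at(sum_rows, (triplets[:, 0], triplets[:, 2]), 1.0)
    np.add.at(sum_time, (triplets[:, 0], triplets[:, 1]), 1.0)
    return sum_rows, sum_time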
Example #2
def build_sparse_B_from_A(A):
    """
        Create the sptensor adjacency tensor of a networkX graph.

        Parameters
        ----------
        A : list
            List of MultiDiGraph NetworkX objects.

        Returns
        -------
        data : sptensor
               Graph adjacency tensor.
        data_T : sptensor
                 Graph adjacency tensor (transpose).
        v_T : ndarray
              Array with values of entries A[j, i] given non-zero entry (i, j).
        rw : list
             List whose elements are reciprocity (considering the weights of the edges) values, one per each layer.
    """

    N = A[0].number_of_nodes()
    L = len(A)
    rw = []

    d1 = np.array((), dtype='int64')
    d2, d2_T = np.array((), dtype='int64'), np.array((), dtype='int64')
    d3, d3_T = np.array((), dtype='int64'), np.array((), dtype='int64')
    v, vT, v_T = np.array(()), np.array(()), np.array(())
    for l in range(L):
        b = nx.to_scipy_sparse_matrix(A[l])
        b_T = nx.to_scipy_sparse_matrix(A[l]).transpose()
        rw.append(np.sum(b.multiply(b_T))/np.sum(b))
        nz = b.nonzero()
        nz_T = b_T.nonzero()
        d1 = np.hstack((d1, np.array([l] * len(nz[0]))))
        d2 = np.hstack((d2, nz[0]))
        d2_T = np.hstack((d2_T, nz_T[0]))
        d3 = np.hstack((d3, nz[1]))
        d3_T = np.hstack((d3_T, nz_T[1]))
        v = np.hstack((v, np.array([b[i, j] for i, j in zip(*nz)])))
        vT = np.hstack((vT, np.array([b_T[i, j] for i, j in zip(*nz_T)])))
        v_T = np.hstack((v_T, np.array([b[j, i] for i, j in zip(*nz)])))
    subs_ = (d1, d2, d3)
    subs_T_ = (d1, d2_T, d3_T)
    data = skt.sptensor(subs_, v, shape=(L, N, N), dtype=v.dtype)
    data_T = skt.sptensor(subs_T_, vT, shape=(L, N, N), dtype=vT.dtype)

    return data, data_T, v_T, rw
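A minimal usage sketch for build_sparse_B_from_A, assuming an older NetworkX release that still provides nx.to_scipy_sparse_matrix (it was removed in NetworkX 3.0), the module-level numpy/sktensor imports the function relies on, and layers that share the same node set in the same insertion order:

import networkx as nx

layers = []
for edges in ([(0, 1, 1.0), (1, 0, 2.0), (1, 2, 1.0)],
              [(0, 2, 1.0), (2, 0, 1.0), (1, 2, 3.0)]):
    G = nx.MultiDiGraph()
    G.add_nodes_from(range(3))          # keep node order identical across layers
    G.add_weighted_edges_from(edges)
    layers.append(G)

data, data_T, v_T, rw = build_sparse_B_from_A(layers)
print(data.shape)   # L x N x N = 2 x 3 x 3
print(rw)           # one weighted reciprocity value per layer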
Example #3
    def update_expec(self, data, avgs):
        if isinstance(data, skt.dtensor):
            ind = data.nonzero()
            non_zero_ent = data[ind]
        elif isinstance(data, skt.sptensor):
            ind = data.subs
            non_zero_ent = data.vals

        size_ind = ind[0].size
        log_a = np.ones((size_ind, self.comp))

        for m in xrange(self.modes):
            log_a *= avgs[m][ind[m], :]
        log_a = np.log(log_a.sum(axis=1))
        q = np.empty((size_ind, self.n_trunc), dtype=np.float64)

        for i in range(1, self.n_trunc + 1):
            q[:, i - 1] = (-i * self.lam) + (non_zero_ent -
                                             i) * np.log(i) + i * log_a + i - 1
        norm = logsumexp(q, axis=1)
        q = np.exp(q - norm[:, np.newaxis])
        self.expec = np.zeros((size_ind, ))
        for i in range(1, self.n_trunc + 1):
            self.expec += i * q[:, i - 1]

        self.expec = skt.sptensor(ind,
                                  self.expec,
                                  shape=self.dim,
                                  dtype=np.float64)
Example #4
def build_sparse_B_from_A(A):
    """
        Create the sptensor adjacency tensor of a networkX graph.

        Parameters
        ----------
        A : list
            List of MultiDiGraph NetworkX objects.

        Returns
        -------
        data : sptensor
               Graph adjacency tensor.
    """

    N = A[0].number_of_nodes()
    L = len(A)

    d1 = np.array((), dtype='int64')
    d2 = np.array((), dtype='int64')
    d3 = np.array((), dtype='int64')
    v = np.array(())
    for l in range(L):
        b = nx.to_scipy_sparse_matrix(A[l])
        nz = b.nonzero()
        d1 = np.hstack((d1, np.array([l] * len(nz[0]))))
        d2 = np.hstack((d2, nz[0]))
        d3 = np.hstack((d3, nz[1]))
        v = np.hstack((v, np.array([b[i, j] for i, j in zip(*nz)])))
    subs_ = (d1, d2, d3)
    data = skt.sptensor(subs_, v, shape=(L, N, N), dtype=v.dtype)

    return data
Example #5
    def __init__(self, subs, vals, shape=None, dtype=int, accumfun=sum.__call__):
        if len(vals) <= 0:
            raise ValueError("the input tensor is ZERO!")

        subs = np.asarray(subs)
        ns, ndims = subs.shape
        self._dimsmin_ = np.min(subs, 0)
        self._dimsmap_ = list()
        for d in range(ndims):
            undim = np.unique(subs[:, d])
            self._dimsmap_.append(dict(zip(undim, range(len(undim)))))

        nwsubs = list()
        for k in range(ns):
            term = list()
            for d in range(ndims):
                term.append(self._dimsmap_[d][subs[k, d]])
            nwsubs.append(np.asarray(term))

        tensor = sptensor(tuple(np.asarray(nwsubs).T), np.asarray(vals),
                          shape, dtype, accumfun=accumfun)
        self.data = dict(zip(map(tuple, np.asarray(tensor.subs).T), tensor.vals))
        self.shape = tensor.shape
        self.ndim = tensor.ndim
        self.nnz = tensor.nnz()
        self.vals = np.sum(self.data.values())
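A standalone sketch of the subscript remapping this constructor performs (a hypothetical helper, not part of the class above): every distinct index along each mode is mapped to a compact 0..n-1 range before the underlying sptensor is built.

import numpy as np

def compact_subs(subs):
    """Map the distinct indices of every mode to 0..n-1, mirroring the __init__ above."""
    subs = np.asarray(subs)
    out = np.empty_like(subs)
    maps = []
    for d in range(subs.shape[1]):
        undim = np.unique(subs[:, d])
        mapping = dict(zip(undim, range(len(undim))))
        maps.append(mapping)
        out[:, d] = [mapping[x] for x in subs[:, d]]
    return out, maps

subs = np.array([[10, 3, 7], [10, 5, 7], [42, 3, 9]])
new_subs, maps = compact_subs(subs)
print(new_subs)   # rows remapped to [0 0 0], [0 1 0], [1 0 1]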
Example #6
def test_spttv(subs, vals, shape):
    S = sptensor(subs, vals, shape=shape)
    K = ktensor([
        np.random.randn(shape[0], 2),
        np.random.randn(shape[1], 2),
        np.random.randn(shape[2], 2)
    ])
    K.innerprod(S)
Example #7
def test_spttv():
    subs = (
        array([0, 1, 0, 5, 7, 8]),
        array([2, 0, 4, 5, 3, 9]),
        array([0, 1, 2, 2, 1, 0])
    )
    vals = array([1, 1, 1, 1, 1, 1])
    S = sptensor(subs, vals, shape=[10, 10, 3])
    K = ktensor([randn(10, 2), randn(10, 2), randn(3, 2)])
    K.innerprod(S)
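A sanity check one could add to this test, assuming the ktensor exposes the U and lmbda attributes and the sptensor the toarray() method used in the other examples here: the inner product must equal the sum of the elementwise product of the dense reconstructions.

import numpy as np
from numpy import array
from numpy.random import randn
from sktensor import sptensor, ktensor

subs = (
    array([0, 1, 0, 5, 7, 8]),
    array([2, 0, 4, 5, 3, 9]),
    array([0, 1, 2, 2, 1, 0])
)
vals = array([1, 1, 1, 1, 1, 1])
S = sptensor(subs, vals, shape=[10, 10, 3])
K = ktensor([randn(10, 2), randn(10, 2), randn(3, 2)])

dense_K = np.einsum('r,ir,jr,kr->ijk', K.lmbda, K.U[0], K.U[1], K.U[2])
assert np.allclose(K.innerprod(S), (dense_K * S.toarray()).sum())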
Example #8
def test_spttv(subs, vals, shape):
    # subs = (
    #    array([0, 1, 0, 5, 7, 8]),
    #    array([2, 0, 4, 5, 3, 9]),
    #    array([0, 1, 2, 2, 1, 0])
    # )
    # vals = array([1, 1, 1, 1, 1, 1])
    S = sptensor(subs, vals, shape=shape)
    K = ktensor([randn(shape[0], 2), randn(shape[1], 2), randn(shape[2], 2)])
    K.innerprod(S)
Example #9
def test_spttv(subs, vals, shape):
    #subs = (
    #    array([0, 1, 0, 5, 7, 8]),
    #    array([2, 0, 4, 5, 3, 9]),
    #    array([0, 1, 2, 2, 1, 0])
    #)
    #vals = array([1, 1, 1, 1, 1, 1])
    S = sptensor(subs, vals, shape=shape)
    K = ktensor([randn(shape[0], 2), randn(shape[1], 2), randn(shape[2], 2)])
    K.innerprod(S)
Example #10
    def _update_theta_gamma(self, m):
        subs_I_M = np.where(self.y_E_DIMS > 1e-4)
        y_spt_DIMS = skt.sptensor(subs_I_M,
                                  self.y_E_DIMS[subs_I_M],
                                  shape=self.y_E_DIMS.shape,
                                  dtype=float)
        tmp_DIMS = y_spt_DIMS.vals / self._reconstruct_nz(y_spt_DIMS.subs)
        uttkrp_nonzero_DK = sp_uttkrp(tmp_DIMS, y_spt_DIMS.subs, m,
                                      self.theta_G_DK_M)

        self.theta_shp_DK_M[m][:, :] = (
            self.alpha + self.theta_G_DK_M[m] * uttkrp_nonzero_DK)
Example #11
    def assignBlock(self, i, s, X,Y, Z1, Z2, S, tensor_dim_ceiling, subs_idx, num_workers, tensor_dim_size):
        _dict = {}
        num_ways = len(tensor_dim_ceiling)
        strata_index = [int(math.floor(i + sum([float(s) / num_workers**way_index for way_index in range(way_index+1)]))) % num_workers for way_index in range(num_ways)]
        strata_range = [range(int(math.ceil(strata_index[way_index] * tensor_dim_ceiling[way_index])), int(math.ceil((strata_index[way_index]+1) * tensor_dim_ceiling[way_index]))) for way_index in range(num_ways)]
        strata_range = [[o for o in each_range if o < tensor_dim_size[index]] for index, each_range in enumerate(strata_range)]        
        strata_range = [range(each_range[0], (each_range[-1] + 1)) for each_range in strata_range]
        total_nb_points = len(subs_idx.value)
        subs = [idx for idx in subs_idx.value if all([idx[way_index] in strata_range[way_index] for way_index in range(num_ways)])]
        subs_x = [tuple(idx) for idx in subs_idx.value if all([idx[way_index] in strata_range[way_index] for way_index in range(num_ways)])]

        X_vals = []
        Y_vals = []
        ZX_vals = []
        ZY_vals = []
        S_vals = []
        if len(subs_x) > 0:
            for i in range(len(subs_x)):
                tensor_index = tuple(np.array(subs_x[i]).T)
                X_vals.append(X[tensor_index][0])
                Y_vals.append(Y[tensor_index][0])
                ZX_vals.append(Z1[tensor_index][0])
                ZY_vals.append(Z2[tensor_index][0])
                S_vals.append(S[tensor_index][0])
            X_subs = sptensor(tuple(np.array(subs_x).T), X_vals,shape=tensor_dim_size, dtype=np.float)
            Y_subs = sptensor(tuple(np.array(subs_x).T), Y_vals,shape=tensor_dim_size, dtype=np.float)
            ZX_subs = sptensor(tuple(np.array(subs_x).T), ZX_vals,shape=tensor_dim_size, dtype=np.float)
            ZY_subs = sptensor(tuple(np.array(subs_x).T), ZY_vals,shape=tensor_dim_size, dtype=np.float)
            S_subs = sptensor(tuple(np.array(subs_x).T), S_vals,shape=tensor_dim_size, dtype=np.float)

            _dict['ratio'] = len(subs_x) / float(total_nb_points)
            _dict['X_subs'] = X_subs
            _dict['Y_subs'] = Y_subs
            _dict['ZX_subs'] = ZX_subs
            _dict['ZY_subs'] = ZY_subs
            _dict['S_subs'] = S_subs
            _dict['subs'] = subs        
            return _dict
        else:
            return None
Example #12
def load_sptensor(fp, start_index=1, dtype=None):
    nmodes = int(fp.readline())
    ndims = tuple(map(int, fp.readline().split()))
    assert nmodes == len(ndims)
    subs = tuple(([] for m in ndims))
    vals = []
    while True:
        line = fp.readline()
        if not line:
            break
        linesep = line.split()
        for m, x in enumerate(linesep[:-1]):
            subs[m].append(int(x) - start_index)
        vals.append((dtype or float)(linesep[-1]))
    return sktensor.sptensor(subs, vals, shape=ndims, dtype=dtype)
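A minimal usage sketch for load_sptensor, reading the format off the parser itself: the first line gives the number of modes, the second the mode sizes, and every further line one non-zero as indices (1-based by default) followed by its value.

import io

fp = io.StringIO(
    "3\n"
    "4 4 2\n"
    "1 2 1 1.0\n"
    "3 4 2 2.5\n"
)
T = load_sptensor(fp)
print(T.shape)   # 4 x 4 x 2
print(T.vals)    # the two stored values, 1.0 and 2.5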
Example #13
    def create_sptensor(self, graphs):
        """
        Create a sparse tensor
        :param graphs: dict mapping timeframe index (1-based) to a NetworkX graph
        :return: sparse adjacency tensor of shape (n_nodes, n_nodes, n_timeframes)
        """
        tuples = []
        # triplets = np.array([(u, v, t) for t in range(1, len(graphs)+1) for u, v in graphs[i].edges_iter()] +
        #                     [(v, u, t) for t in range(1, len(graphs)+1) for u, v in graphs[i].edges_iter()])
        for i, graph in graphs.iteritems():
            for u, v in graph.edges_iter():
                tuples.append([self.node_pos[u], self.node_pos[v], i - 1])
                tuples.append([self.node_pos[v], self.node_pos[u], i - 1])
        triplets = np.array([(u, v, t) for u, v, t in tuples])
        T = sptensor(tuple(triplets.T),
                     vals=np.ones(len(triplets)),
                     shape=(len(self.node_ids), len(self.node_ids), len(graphs)))
        return T
Example #14
def get_tensor(middle_end='sktensor', cutoff=10):
    logging.info('Reweighting: log')
    verb_tensor_path = os.path.join(
        projdir, '{}/tensor_{}.pkl'.format(middle_end, cutoff))
    if os.path.exists(verb_tensor_path):
        logging.info('Loading tensor from {}'.format(verb_tensor_path))
        tensor, indices = pickle.load(open(verb_tensor_path, mode='rb'))
        logging.debug(tensor.shape)
        return tensor, indices
    occurrence, marginals = mazsola_reader()

    def get_index(freq_dict):
        items = sorted(filter(lambda item: item[1] >= cutoff,
                              freq_dict.items()),
                       key=operator.itemgetter(1),
                       reverse=True)
        logging.debug(items[-3:])
        return dict([(w, i) for i, (w, f) in enumerate(items)])

    coords, data = ([], [], []), []
    indices = [get_index(fd) for fd in marginals]
    logging.info('Building tensor...')
    logging.info('  Populating lists...')
    for i, ((svo), freq) in enumerate(occurrence.items()):
        if not i % 2000000:
            logging.debug('    {:,}'.format(i))  #'{} {}'.format(svo[1], freq))
        for j, word in enumerate(svo):
            if word not in indices[j]:
                break
        else:
            for j, word in enumerate(svo):
                coords[j].append(indices[j][word])
            data.append(np.log(freq))
    logging.info('  Creating array')
    shape = tuple(map(len, indices))
    logging.info(shape)
    if middle_end == 'tensorly':
        tensor = sparse.COO(coords, data,
                            shape=shape)  #, has_duplicates=False)
    elif middle_end == 'sktensor':
        tensor = sktensor.sptensor(coords, data, shape=shape)
    else:
        raise NotImplementedError
    pickle.dump((tensor, indices), open(verb_tensor_path, mode='wb'))
    logging.info(tensor)
    return tensor, indices
Example #15
def sptensor_from_dense_array(X):
    """
        Create an sptensor from a ndarray or dtensor.
        Parameters
        ----------
        X : ndarray
            Input data.

        Returns
        -------
        sptensor from a ndarray or dtensor.
    """

    subs = X.nonzero()
    vals = X[subs]

    return skt.sptensor(subs, vals, shape=X.shape, dtype=X.dtype)
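A small round-trip check for sptensor_from_dense_array, assuming skt is "import sktensor as skt" and that sptensor exposes the toarray() and nnz() methods used elsewhere in these examples:

import numpy as np

X = np.zeros((4, 3, 2))
X[0, 1, 0] = 2.0
X[3, 2, 1] = 5.0

S = sptensor_from_dense_array(X)
assert S.nnz() == 2                    # only the non-zero entries are stored
assert np.allclose(S.toarray(), X)     # densifying reproduces the input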
Example #16
def mach(X, ranks, p):
    """
    Implementation of MACH proposed in
    C. E. Tsourakakis. Mach: Fast randomized tensor decompositions. In ICDM, pages 689–700, 2010.
    """

    prod_ns = np.prod(X.shape)
    indn = np.random.choice(prod_ns, int(prod_ns * p), replace=False)
    multinds = np.unravel_index(indn, X.shape)
    X_sp = st.sptensor(multinds, (1 / p) * X[multinds], shape=X.shape)

    ### for sparse eigen decomposition (scipy linalg problem)
    _ranks = np.array(ranks)
    _shape = np.array(X.shape)
    _ind = _ranks >= _shape
    _ranks[_ind] = _shape[_ind] - 1
    return st.tucker_hooi(X_sp, _ranks.tolist(), init='nvecs')
Example #17
    def _init_data(self, data, mask=None):
        if isinstance(data, np.ndarray):
            data = skt.sptensor(data.nonzero(), data[data.nonzero()],
                                data.shape)
        assert isinstance(data, skt.sptensor)
        assert data.ndim == 4
        assert data.shape[0] == data.shape[1]
        V, A, T = data.shape[1:]
        self.n_actors = V
        self.n_actions = A
        self.n_timesteps = T

        if mask is not None:
            assert isinstance(mask, np.ndarray)
            assert (mask.ndim == 2) or (mask.ndim == 3)
            assert mask.shape[-2:] == (V, V)
            assert np.issubdtype(mask.dtype, np.integer)

        return data
Example #18
    def _init_data(self, data, mask=None):
        if isinstance(data, np.ndarray):
            data = skt.sptensor(data.nonzero(),
                                data[data.nonzero()],
                                data.shape)
        assert isinstance(data, skt.sptensor)
        assert data.ndim == 4
        assert data.shape[0] == data.shape[1]
        V, A, T = data.shape[1:]
        self.n_actors = V
        self.n_actions = A
        self.n_timesteps = T

        if mask is not None:
            assert isinstance(mask, np.ndarray)
            assert (mask.ndim == 2) or (mask.ndim == 3)
            assert mask.shape[-2:] == (V, V)
            assert np.issubdtype(mask.dtype, np.integer)

        return data
Example #19
    points_c = []
    vals = []

    for k in range(len(predicates)):
        tups = dictTrp1[k]
        for tup in tups:
            points_a.append(tup[0])
            points_b.append(tup[1])
            points_c.append(k)
            vals.append(tup[2])

    L = []
    L.append(points_a)
    L.append(points_b)
    L.append(points_c)
    X1 = sptensor(tuple(L), vals, shape=(I, J, len(predicates)), dtype=float)

    points_a = []
    points_b = []
    points_c = []
    vals = []

    for k in range(len(predicates)):
        tups = dictTrp2[k]
        for tup in tups:
            points_a.append(tup[0])
            points_b.append(tup[1])
            points_c.append(k)
            vals.append(tup[2])

    L = []
Example #20
    def get_tensor(self, log=True, divide_by_marginal=False):
        def get_index(freq_dict):
            items = sorted(filter(lambda item: item[1] >= self.cutoff,
                                  freq_dict.items()),
                           key=operator.itemgetter(1),
                           reverse=True)
            logging.debug(items[-3:])
            return dict([(w, i) for i, (w, f) in enumerate(items)])

        verb_tensor_path = os.path.join(self.projdir, '{}_{}.pkl').format(
            'pmi' if divide_by_marginal else 'logfreq',  # TODO
            self.cutoff)
        if False:  #os.path.exists(verb_tensor_path):
            logging.info('Loading tensor from {}'.format(verb_tensor_path))
            tensor, indices = pickle.load(open(verb_tensor_path, mode='rb'))
            logging.debug(tensor.shape)
            return tensor, indices
        occurrence, marginals = self.mazsola_reader()
        coords, data = tuple([] for _ in range(self.ndim)), []
        indices = [get_index(fd) for fd in marginals]
        logging.info('Building tensor...')
        logging.info('  Populating lists...')
        total = 0
        for i, ((svo), freq) in enumerate(occurrence.items()):
            if not i % 2000000:
                logging.debug('    {:,}'.format(i))
            for j, word in enumerate(svo):
                if svo[j] not in indices[j]:
                    break
            else:
                for i, word in enumerate(svo):
                    coords[i].append(indices[i][svo[i]])
                to_debug = (coords[0][-1] == 1 and coords[1][-1] == 1
                            and coords[2][-1] == 0)
                #if not pmi:
                # TODO freq += 1
                if to_debug:
                    logging.debug(freq)
                total += freq
                if to_debug:
                    logging.debug(freq)
                if divide_by_marginal:
                    # TODO PPMI
                    for i in range(self.ndim):
                        freq /= marginals[i][svo[i]]
                        if to_debug:
                            logging.debug((marginals[i][svo[i]], freq))
                if log:
                    freq = np.log(freq)
                if to_debug:
                    logging.debug(freq)
                data.append(freq)
        logging.info('  Total: {}'.format(total))
        logging.info('  Creating array')
        shape = tuple(map(len, indices))
        logging.info(shape)
        data = np.array(data)
        if divide_by_marginal:
            if log:
                data += 2 * np.log(total)
            else:
                data *= total**2
        tensor = sktensor.sptensor(coords, data, shape=shape)
        #pickle.dump((tensor, indices), open(verb_tensor_path, mode='wb'))
        logging.info(tensor)
        return tensor, indices
Example #21
import numpy.random as rn
import cPickle as pickle
import numpy as np
import sktensor as skt

data = rn.poisson(0.2, size=(25, 25, 100))  # 3-mode count ndarray of shape 25 x 25 x 100 (mostly zeros)

subs = data.nonzero()                    # subscripts where the ndarray has non-zero entries   
vals = data[data.nonzero()]              # corresponding values of non-zero entries
sp_data = skt.sptensor(subs,             # create an sktensor.sptensor 
                       vals,
                       shape=data.shape,
                       dtype=data.dtype)

with open('data.dat', 'wb') as f:            # can be stored as a .dat using pickle (binary mode)
    pickle.dump(sp_data, f)

with open('data.dat', 'rb') as f:            # can be loaded back in using pickle.load
    tmp = pickle.load(f)
    assert np.allclose(tmp.vals, sp_data.vals)
Example #22
    def tosptensor(self):
        return sptensor(tuple(np.asarray(self.data.keys()).T),
                        np.asarray(self.data.values()), self.shape)
Example #23
def HOALS(data,dims,ranks,model='tucker',lambda_=0.8,alpha=0.1,num_iters=5,implicit=False):
    """
    Parameters
    data : DataFrame	
        [0] : userId
        [1] : itemId
        [2] : actionId
        [3] : rating
    dims : list
        [0] : number of users
        [1] : number of items
        [2] : number of actions
    """
    data.columns = np.arange(data.shape[1])
    C_train = sktensor.sptensor((data[2],data[0],data[1]),
	        data[3],shape=(dims[2],dims[0],dims[1]))

    #==============================================================================
    # recover the (user, item, rating) triples from the unfolded matrices
    #==============================================================================

    # train set
    C1 = sktensor.csr_matrix(C_train.unfold(1))
    y1 = list(C1.indices)
    indptr1 = C1.indptr
    r1 = list(C1.data)
    tmp1 = indptr1[1:len(indptr1)]-indptr1[0:(len(indptr1)-1)]
    x1 = []
    for i in np.arange(len(tmp1)):
        x1.extend(np.repeat(i,tmp1[i]))

    C2 = sktensor.csr_matrix(C_train.unfold(2))
    y2 = list(C2.indices)
    indptr2 = C2.indptr
    r2 = list(C2.data)
    tmp2 = indptr2[1:len(indptr2)]-indptr2[0:(len(indptr2)-1)]
    x2 = []
    for i in np.arange(len(tmp2)):
        x2.extend(np.repeat(i,tmp2[i]))
        
    C3 = sktensor.csr_matrix(C_train.unfold(0))
    y3 = list(C3.indices)
    indptr3 = C3.indptr
    r3 = list(C3.data)
    tmp3 = indptr3[1:len(indptr3)]-indptr3[0:(len(indptr3)-1)]
    x3 = []
    for i in np.arange(len(tmp3)):
        x3.extend(np.repeat(i,tmp3[i]))

    dataTrain = {}
    dataTrain[0] = pd.DataFrame([x1,y1,r1]).T
    dataTrain[1] = pd.DataFrame([x2,y2,r2]).T
    dataTrain[2] = pd.DataFrame([x3,y3,r3]).T
    
    dataTrain[0] = dataTrain[0][dataTrain[0][2]!=0] # where the rating is not null
    dataTrain[1] = dataTrain[1][dataTrain[1][2]!=0]
    dataTrain[2] = dataTrain[2][dataTrain[2][2]!=0]

    #==============================================================================
    # Factorization
    #==============================================================================
    ratings = {}
    res = {}
    features = {}
    times = []
    for i in range(3):
        if i==0:
            mode = 'User'
        elif i==1:
            mode = 'Item'
        elif i==2:
            mode = 'Action'

        print("Start "+mode+" Learning")
        
        dataTrain[i] = sqlContext.createDataFrame(dataTrain[i]).rdd
        ratings[i] = dataTrain[i].map(lambda l: Rating(float(l[0]), float(l[1]), float(l[2])))
        #ratings[i] = dataTrain[i].map(lambda l: array([float(l[0]), float(l[1]), float(l[2])]))

      # Build the recommendation model using Alternating Least Squares
        t0 = time.time()
        if implicit:
            res[i] = ALS.trainImplicit(ratings=ratings[i], rank=ranks[i], iterations=num_iters, seed=0, lambda_=lambda_, alpha=alpha)
        else:
            res[i] = ALS.train(ratings=ratings[i], rank=ranks[i], iterations=num_iters, seed=0, lambda_=lambda_)
        t1 = time.time()
        delta = t1-t0
        print('time :',delta)
        times.append(delta)

        #features[i] = res[i].userFeatures()
	
    print('longest mode time :',np.max(times))
Example #24

subs_1 = np.append(data[:,:2], np.zeros((n, 1)), 1)
subs_2 = np.append(data[:,:2], np.ones((n, 1)), 1)

subs = np.vstack([subs_1, subs_2])
subs = subs.astype(int)

vals = np.hstack([data[:,2], data[:, 3]])
vals = vals.flatten()

# convert subs to a tuple of arrays (rows, cols, tubes)
subs = (subs[:,0], subs[:,1], subs[:,2])

# load into sparse tensor
T = sptensor(subs, vals)




logging.debug("Starting Tucker decomposition")

#T = loadmat('../datasets/alyawarra/alyawarradata.mat')['Rs']
#X = [lil_matrix(T[:, :, k]) for k in range(T.shape[2])]


# Decompose tensor using Tucker-HOOI
P = tucker_hooi(T, [10, 10, 2], init='random')
logging.debug("Finished tucker decomposition")
Example #25
def main(n_top_words, alpha, beta, rank, priv, n_iters=200):
    # output_data_shape = (n_docs, n_words)
    # theta_DK = np.random.gamma(alpha, beta, (n_docs, rank))
    # phi_KV = np.random.gamma(alpha, beta, (rank, n_words))
    # poisson_priors_DV = parafac((theta_DK, phi_KV.T))
    # data_DV = np.random.poisson(poisson_priors_DV, output_data_shape)

    with np.load('sotu_years.npz') as dat_file:
        data_DV = dat_file['Y_DV']
        vocab = dat_file['types_V']
    n_docs, n_words = data_DV.shape
    bpptf_model = BPPTF(n_modes=2, n_components=rank, verbose=True, max_iter=1)
    bptf_model = BPTF(n_modes=2, n_components=rank, verbose=True, max_iter=1)

    # initialize both models
    modes = (0, 1)
    data_usable = preprocess(data_DV)
    if isinstance(data_usable, skt.dtensor):
        bpptf_model.data_DIMS = data_usable.copy()
    else:
        bpptf_model.data_DIMS = skt.sptensor(
            tuple((np.copy(ds) for ds in data_usable.subs)),
            data_usable.vals.copy())

    bpptf_model._init_all_components(data_usable.shape)
    bptf_model._init_all_components(data_usable.shape)

    bpptf_model.y_E_DIMS = data_usable
    if isinstance(data_usable, skt.sptensor):
        bpptf_model.y_E_DIMS = bpptf_model.y_E_DIMS.toarray()
    for i in range(n_iters):
        print i
        for m in modes:
            # check_equal(bpptf_model, bptf_model, m)

            bpptf_model._update_theta_gamma(m)
            bptf_model._update_gamma(m, data_usable)
            # check_equal(bpptf_model, bptf_model, m)

            bpptf_model._update_theta_delta(m, None)
            bptf_model._update_delta(m, None)
            # check_equal(bpptf_model, bptf_model, m)

            bpptf_model._update_cache(m)
            bptf_model._update_cache(m)
            # check_equal(bpptf_model, bptf_model, m)

            bpptf_model._update_beta(m)  # must come after cache update!
            bptf_model._update_beta(m)
            # check_equal(bpptf_model, bptf_model, m)

            bpptf_model._check_component(m)
            bptf_model._check_component(m)
            # check_equal(bpptf_model, bptf_model, m)

    print "Old topics"
    new_phi = bptf_model.E_DK_M[1].T
    top_words = np.argpartition(new_phi, n_words - n_top_words)[:,
                                                                -n_top_words:]
    for topic in xrange(rank):
        top_word_vals = zip(-new_phi[topic, top_words[topic]],
                            vocab[top_words[topic]])
        print topic, ' '.join(
            ['{}'.format(wd) for (_, wd) in sorted(top_word_vals)])

    print "\nNew topics"
    new_phi = bpptf_model.theta_E_DK_M[1].T
    top_words = np.argpartition(new_phi, n_words - n_top_words)[:,
                                                                -n_top_words:]
    for topic in xrange(rank):
        top_word_vals = zip(-new_phi[topic, top_words[topic]],
                            vocab[top_words[topic]])
        print topic, ' '.join(
            ['{}'.format(wd) for (_, wd) in sorted(top_word_vals)])
Example #26
def sptensor_from_dense_array(X):
    """Creates an sptensor from an ndarray or dtensor."""
    subs = X.nonzero()
    vals = X[subs]
    return skt.sptensor(subs, vals, shape=X.shape, dtype=X.dtype)
Example #27
    def fromarray(A):
        """Create an sptensor from a dense numpy array."""
        subs = np.nonzero(A)
        vals = A[subs]
        return sptensor(subs, vals, shape=A.shape, dtype=A.dtype)
Example #28
# Generate a synthetic toy data set
true_A_IK = prg(alpha, lambd,
                size=(n_genes, n_feats))  # synthetic genes x feats matrix
true_P_JK = prg(alpha, lambd,
                size=(n_cells, n_feats))  # synthetic cells x feats matrix
true_M_IJ = true_A_IK.dot(true_P_JK.T)  # synthetic mean of observed counts
true_Y_IJ = np.zeros_like(true_M_IJ, dtype=int)  # synthetic observed counts
true_Y_IJ[true_M_IJ > 0] = rn.poisson(true_M_IJ[true_M_IJ > 0])

subs = true_Y_IJ.nonzero()  # subscripts where the ndarray has non-zero entries
vals = true_Y_IJ[
    true_Y_IJ.nonzero()]  # corresponding values of non-zero entries
sp_data = skt.sptensor(
    subs,  # create an sktensor.sptensor 
    vals,
    shape=true_Y_IJ.shape,
    dtype=true_Y_IJ.dtype)

sns.heatmap(true_Y_IJ, cmap='Blues')
plt.show()

model = PRGPMF(n_genes=n_genes,
               n_cells=n_cells,
               n_feats=n_feats,
               alpha=alpha,
               lambd=lambd,
               seed=seed,
               n_threads=n_threads)

n_samples = 100  # how many posterior samples to collect
Example #29
 def RDFParsing(self,path,isSparse):
     #parse RDF       
     g=rdflib.Graph()
     g.parse(file=open(path, "r"),
             format="application/rdf+xml")        
     entities = []       
     predicates = []
    
     for s in g.subjects(None, None):
         ss=s.encode('utf-8').__str__()
         if  ss not in entities:                
             entities.extend([ss])                
     for o in g.objects(None, None):
         oo=o.encode('utf-8').__str__()
         if type(o).__name__ != "Literal" :
             if oo not in entities:
                 entities.extend([oo])
     for p in g.predicates(None, None):           
         pp=p.encode('utf-8').__str__()
         if pp not in predicates:                
             predicates.extend([pp])        
    
     entities = np.array(entities)
     
     #print entities    
     logging.warning( "*************************"  )  
     predicates = np.array(predicates)        
     #print predicates    
     
     #prepare tensor frontal slices as np matrices
     lenentities = len(entities)
     lenpredicates = len(predicates)               
     logging.warning("Tensor: "+str(lenentities)+" X "+str(lenentities)+" X "+str(lenpredicates))        
     T = np.zeros((lenentities, lenentities, lenpredicates),dtype=np.int)           
     print "entities".join(map(str, entities))
     
     for s,p,o in g:
         try:
             logging.warning("try to build the tensor")
             ss=s.__str__().encode('utf-8')
             oo=o.__str__().encode('utf-8')
             pp=p.__str__().encode('utf-8')
             i,j,k=entities.tolist().index(ss),entities.tolist().index(oo),predicates.tolist().index(pp)
             #logging.warning("ijk".join(map(str, [i,j,k])))                
             #print "ijk".join(map(str, [i,j,k]))
             value=1.0
             if "__" in p:
                 #print p.split("__")[len(p.split("__"))-1]
                 value=float(p.split("__")[len(p.split("__"))-1])
                 #print "value"+value
             T[i, j, k] = value
         except:
             #printException()        
             continue
     if isSparse:
         xyz, c = self.sparsed(T)
         logging.warning("xyz: ")
         logging.warning("-".join(map(str, xyz)))
         #logging.warning("c: ".join(c))
         Tensor = sptensor(xyz, c, shape=(lenentities, lenentities, lenpredicates), dtype=np.int)                
     else:
         Tensor = dtensor(T)        
     #print "*************************"
     #print Tensor[:,:,slice] 
     return Tensor,g,entities,predicates
Example #30
def sptensor_from_dense_array(X):
    """Creates an sptensor from an ndarray or dtensor."""
    subs = X.nonzero()
    vals = X[subs]
    return skt.sptensor(subs, vals, shape=X.shape, dtype=X.dtype)
Example #31
def run_case_study():
    iter_cnt = 10
    nb_trial = 3

    alg_names = [PAIRFAC]
    # alg_names = [SDCDT]
    iters = [iter_cnt] * len(alg_names)

    if alg_names[0].__name__ in ["SDCDT"]:
        distance = int(sys.argv[1])
        alpha = float(sys.argv[2])
        beta = float(sys.argv[3])

        alpha_pars = [alpha]
        beta_pars = [beta]
        gamma_pars = [1e+0]
        delta_pars = [1e+0]

    else:
        distance = 3
        alpha = float(sys.argv[1])
        beta = float(sys.argv[2])
        gamma = float(sys.argv[3])
        alpha_pars = [alpha]
        beta_pars = [beta]
        gamma_pars = [gamma]
        delta_pars = [1e-8]

    case_study = "ha_{}mp4d".format(str(sys.argv[4]))
    case_study = "wpi_{}mp4d".format(str(sys.argv[4]))
    sub_dir = "classification"
    num_workers = 2
    nb_points = 40000  # nyc
    train_proportions = [10]
    layers = [0]

    _type = "function"

    if case_study.startswith("wpi_"):
        nb_points = 4000
        dims = [10, 59, 2]
        dims = [266, 59, 4, 10]
        if case_study.endswith("4d"):
            dims = [376, 58, 7, 10]
            if "5" in case_study:
                dims = [266, 59, 4, 10]
            if "6" in case_study:
                dims = [408, 68, 4, 10]
        domain = "math"
        source1 = "good"
        source2 = "bad"
        R_set = {0: 6}
        R_check = 6

    if case_study.startswith("ha_"):
        nb_points = 4000
        dims = [5, 34, 6, 54]
        domain = "mooc"
        source1 = "good"
        source2 = "bad"
        R_set = {0: 6}
        R_check = 6

    k = domain + "_" + case_study

    bootstrap_seed_list = [0, 1, 2, 3, 4]
    for bootstrap_seed in bootstrap_seed_list:

        _log.info("distance:{}".format(distance))
        _log.info("bootstrap_seed:{}".format(bootstrap_seed))
        from datetime import datetime
        idx_list, value_list_X, value_list_Y, value_list_ZX, value_list_ZY, value_list_S = read_domain_data(
            source1,
            source2,
            dims,
            domain,
            _type,
            case_study,
            k,
            nb_points,
            bootstrap_seed=bootstrap_seed)
        X = sptensor(tuple(np.asarray(idx_list).T),
                     value_list_X,
                     shape=dims,
                     dtype=np.float)
        Y = sptensor(tuple(np.asarray(idx_list).T),
                     value_list_Y,
                     shape=dims,
                     dtype=np.float)

        conf = SparkConf().setAppName("PairFac...")
        sc = SparkContext(conf=conf)

        Lambda_all = [
            alpha_pars, beta_pars, gamma_pars, delta_pars, train_proportions
        ]
        Lambda_all = list(itertools.product(*Lambda_all))

        len_paraset = len(Lambda_all)
        cur_para_index = 0

        train_cur_proportion = float(10)
        test_portion = (10.0 - train_cur_proportion) / 2.0
        np.random.seed(2)
        Weight = np.random.choice(
            [0, 1, 2, 3],
            size=(len(value_list_ZX), ),
            p=[
                test_portion / 10, test_portion / 10,
                train_cur_proportion / 10,
                (10.0 - train_cur_proportion - test_portion * 2) / 10
            ])
        train_index_1 = train_index_2 = np.where(Weight == 2)
        validation_index_1 = validation_index_2 = np.where(Weight == 1)
        test_index_1 = test_index_2 = np.where(Weight == 0)

        X_train = construct_tensor(value_list_X, train_index_1, idx_list, dims)
        Y_train = construct_tensor(value_list_Y, train_index_1, idx_list, dims)
        ZX_train = construct_tensor(value_list_ZX, train_index_1, idx_list,
                                    dims)
        ZY_train = construct_tensor(value_list_ZY, train_index_1, idx_list,
                                    dims)
        S_train = construct_tensor(value_list_S, train_index_1, idx_list, dims)

        non_zero_idxs = np.asarray(idx_list)[train_index_1[0]]
        D_matrix = np.zeros((X_train.shape[0], X_train.shape[0]))
        W_matrix = np.zeros((X_train.shape[0], X_train.shape[0]))

        for alg_name, iter_cnt in zip(alg_names, iters):
            for each_lambda in Lambda_all:
                progress = cur_para_index * 1.0 / len_paraset
                cur_para_index += 1
                _log.info('[{}] Running {}...'.format(alg_names[0].__name__,
                                                      progress))
                alg = alg_name()
                Lambda = list(each_lambda)
                cur_paras = '_'.join([str(x) for x in each_lambda])
                fname = 'weight_s_t_2_{}_layer_0_distance_{}_seed_{}_R_{}'.format(
                    alg.__class__.__name__, distance, bootstrap_seed, R_check)
                directory_ = PROJECT_DIR + "/output/output_" + k + "_" + sub_dir + ""
                if not os.path.exists(directory_):
                    os.makedirs(directory_)

                layer_fileName = directory_ + "/weights/" + str(
                    cur_paras) + "/" + str(fname)

                embeddings_dir = directory_ + "/embeddings/" + str(
                    alg_names[0].__name__) + "/" + str(
                        bootstrap_seed) + "/" + cur_paras
                if not os.path.exists(embeddings_dir):
                    os.makedirs(embeddings_dir)
                weights_dir = directory_ + "/weights/" + cur_paras
                if not os.path.exists(weights_dir):
                    os.makedirs(weights_dir)
                if os.path.exists(layer_fileName):
                    _log.info("{} exists".format(layer_fileName))
                    continue

                alg.run_multi_trials( sc, X_train, Y_train, ZX_train, ZY_train, S_train, k,k, Lambda, D_matrix, W_matrix, \
                    num_trial=nb_trial, max_iter=iter_cnt, verbose=2,
                    noise=0.01,nb_points=nb_points,
                    non_zero_idxs=non_zero_idxs,
                    num_workers=num_workers,distance=distance, R_set = R_set,layers = layers,bootstrap_seed = bootstrap_seed)

                gc.collect()

        sc.stop()
Example #32
def NTF_sampling_24H(sampling,
                     df_classified,
                     flags,
                     start,
                     stop,
                     path,
                     vectorizer_new=0,
                     vectorizer_s=0,
                     n_topics=10,
                     n_features=1000,
                     matlab=False,
                     monuments=False):
    # Feed the vectorizer with all the words in the dataset. Counts is the tweet/term matrix.
    # fit_transform: fit first (build the features list with the relevant words)
    # then transform: build the tweet/term matrix with the relevant tokens.
    if not vectorizer_new:
        print 'No vectorizer defined. Returning None'
        return None
    if matlab:
        name_matlab = path + 'matlab/TorInst{}Matr'.format(n_features)
    Coord_CRS_global = []
    Data_CRS_global = []
    Ncells = []
    snapshots = df_classified.columns.tolist()[0:-1]
    if start >= stop:
        print 'incorrect start and/or stop dates. performing NTF on whole passed dataset'
        stop = min(len(flags), len(snapshots))

    #For every snapshot taken
    ct = 0
    for month in snapshots:
        ct += 1
        print flags[snapshots.index(month)]
        This_Month = df_classified[month].tolist()
        #print len(list_reviews_rest), 'tagged cells for ', year_month

        # Learn the vocabulary dictionary and return term-document matrix.
        print len(This_Month)
        counts = vectorizer_new.transform(This_Month)
        #Transform a count matrix to a normalized tf-idf representation.
        #(i.e terms with frequencies too hi or lo are removed)
        # Weights are indexed by (postID, term): weight
        tfidf = TfidfTransformer().fit_transform(counts)
        if matlab:
            savemat(name_matlab + str(ct), {'tfidf': tfidf})
        #print 'tfidf done:'
        #print tfidf
        C, D = IL.read_CRS_totensor(
            tfidf, n_features,
            snapshots.index(month) - flags.index(start))
        #print 'C,D'#, C,D
        Coord_CRS_global.append(C)
        Data_CRS_global.append(D)

    triples = []
    triples_data = []
    #For every month in the timeline
    for i in range(0, len(Coord_CRS_global)):
        c = Coord_CRS_global[i]
        # For every post in this month
        for e in c:
            #Add the non-zero elements coordinates
            triples.append(e)

    for d in Data_CRS_global:
        for e in d:
            triples_data.append(e)
    triples = [list(i) for i in triples]
    try:
        #        maxNcells=max([e[0] for e in triples])
        maxNcells = len(df_classified[start])
    except ValueError:
        print 'no non-zero element. returning None'
        print triples
        return None

    # Build a sktensor, which is ncp friendly. The dimensions have to be
    # N_bins x n_features x N_months.
    # N_months = len(Nposts) e.g, or len(Coord_CRS_global)
    # N_posts_total=sum(Nposts)
    X = sktensor.sptensor(tuple(list(np.asarray(triples).T)),
                          triples_data,
                          shape=(maxNcells, n_features,
                                 df_classified.shape[1]))
    X_approx_ks = ncp.nonnegative_tensor_factorization(X,
                                                       n_topics,
                                                       method='anls_bpp')
    A = X_approx_ks.U[0]
    B = X_approx_ks.U[1]
    C = X_approx_ks.U[2]
    lambdas = X_approx_ks.lmbda

    voc_vector = {k: v for v, k in vectorizer_s.vocabulary_.iteritems()}
    voc_serie = pd.Series(voc_vector)
    TermVectors = []
    TermVectorsIndex = []
    for row in B.T:
        row = list(row)
        row = [(r, row.index(r)) for r in sorted(row)[::-1]]
        TermVectors.append(set([voc_vector[e[1]] for e in row]))
        TermVectorsIndex.append([(voc_vector[e[1]], e[0]) for e in row])
    for i in range(0, len(TermVectorsIndex)):
        TermVectorsIndex[i].sort(key=lambda tup: tup[1])
        TermVectorsIndex[i] = TermVectorsIndex[i][::-1]
    return A, B, C, TermVectorsIndex, TermVectors, lambdas
Example #33
def construct_tensor(value_list, value_index, idx_list, dims):
    return sptensor(tuple(np.asarray(idx_list)[value_index[0]].T),
                    list(np.asarray(value_list)[value_index[0]]),
                    shape=dims,
                    dtype=np.float)
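A minimal usage sketch for construct_tensor with hypothetical inputs: idx_list holds one index tuple per observation, value_list the matching values, and value_index is a np.where-style tuple selecting which observations enter the tensor. It assumes sptensor is sktensor's sptensor and an older NumPy in which the np.float alias used above still exists.

import numpy as np

idx_list = [(0, 1, 0), (2, 0, 1), (1, 1, 1)]
value_list = [1.0, 2.0, 3.0]
value_index = (np.array([0, 2]),)     # keep the first and third observation
T = construct_tensor(value_list, value_index, idx_list, dims=(3, 2, 2))
print(T.vals)                         # [1.0, 3.0]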