def generate_data(case, sparse=False):
    # Generate regression / classification data.
    bunch = None
    if case == 'regression':
        bunch = datasets.load_boston()
    elif case == 'classification':
        bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

    return data
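A minimal usage sketch for the helper above, assuming the imports the snippet relies on (sklearn.datasets, sklearn.utils.shuffle, scipy.sparse.csr_matrix, numpy as np) are in scope; note that load_boston() only exists in scikit-learn versions before 1.2:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.utils import shuffle

# Hypothetical driver: 80/20 split of the 20 newsgroups data as sparse matrices.
data = generate_data('classification', sparse=True)
print(data['X_train'].shape, data['X_test'].shape)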
Example 2
def mkneighbors_graph(observations, n_neighbours, metric, mode='connectivity', metric_params=None):
    """
    Computes the (weighted) graph of mutual k-Neighbors for observations.
    
    Notes
    -----
    The distance between an observation and itself is never computed and instead set to
    ``numpy.inf``. I.e. only in the case of k>=n_observations or when the ``metric``
    returns ``numpy.inf``, the returned graph can contain loops.
    
    Parameters
    ----------
    observations : sequence
        Sequence of observations.
    n_neighbours : int
        Maximum number of neighbours for each sample.
    metric : function
        The distance metric taking two observations and returning a numeric value > 0.
    mode : {'connectivity', 'distance', 'both'}, optional
        Type of returned matrix: 'connectivity' will return the connectivity matrix with
        ones and zeros, in 'distance' the edges are distances between points, while
        'both' returns a (connectivity, distance) tuple.
    metric_params : dict, optional  (default = None)
            Additional keyword arguments for the metric function.
            
    Returns
    -------
    mkneighbors_graph : ndarray
        Sparse matrix in CSR format, shape = [n_observations, n_observations].
        mkneighbors_graph[i, j] is assigned the weight of edge that connects i to j.
        Might contain ``numpy.inf`` values.

    """
    # compute the full pairwise-distance matrix
    # (uses scipy.spatial.distance pdist/squareform; metric_params, if given, is forwarded to the metric)
    pdists = squareform(pdist(observations, metric, **(metric_params or {})))
    # the distance between an observation and itself is never considered
    numpy.fill_diagonal(pdists, numpy.inf)

    # get the k nearest neighbours for each observation
    k_nearest_nbhs = numpy.argsort(pdists)[:, :n_neighbours]
    
    # create a mask denoting the k nearest neighbours in image_pdist
    k_nearest_mutual_nbhs_mask = numpy.zeros(pdists.shape, bool)
    for _mask_row, _nbhs_row in zip(k_nearest_mutual_nbhs_mask, k_nearest_nbhs):
        _mask_row[_nbhs_row] = True
        
    # and with transposed to remove non-mutual nearest neighbours
    k_nearest_mutual_nbhs_mask &= k_nearest_mutual_nbhs_mask.T
    
    # set distance not in the mutual k nearest neighbour set to zero
    pdists[~k_nearest_mutual_nbhs_mask] = 0
    
    # check for edges with zero-weight
    if numpy.any(pdists[k_nearest_mutual_nbhs_mask] == 0):
        warnings.warn('The graph contains at least one edge with a weight of "0".')
        
    if 'connectivity' == mode:
        return csr_matrix(k_nearest_mutual_nbhs_mask)
    elif 'distance' == mode:
        return csr_matrix(pdists)
    else:
        return csr_matrix(k_nearest_mutual_nbhs_mask), csr_matrix(pdists)
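A small, hypothetical usage sketch for mkneighbors_graph, assuming numpy and the scipy.spatial.distance imports the function relies on are available:

import numpy

observations = numpy.random.rand(12, 3)
euclidean = lambda a, b: numpy.linalg.norm(a - b)

# mutual 3-nearest-neighbour graph; mode='both' returns (connectivity, distance)
connectivity, distances = mkneighbors_graph(observations, 3, euclidean, mode='both')
print(connectivity.shape, distances.nnz)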
    def test_mutual_information(self):
        X = array([[0, 1],
                   [1, 0],
                   [1, 1]])
        y = array([[0, 1],
                   [1, 0],
                   [1, 0]])

        assert_array_approx_equal(mutual_information(X, y), [-0.37489, -0.605939], decimal=3)
        assert_array_approx_equal(mutual_information(csr_matrix(X), csr_matrix(y)), [-0.37489, -0.605939], decimal=3)
    def test_pointwise_mutual_information(self):
        X = array([[0, 1],
                   [1, 0],
                   [1, 1]])
        y = array([[0, 1],
                   [1, 0],
                   [1, 0]])

        assert_array_approx_equal(pointwise_mutual_information(X, y), [0.1178, 0.1178], decimal=3)
        assert_array_approx_equal(pointwise_mutual_information(csr_matrix(X), csr_matrix(y)),
                                  [0.1178, 0.1178], decimal=3)
Example 5
    def test_BRKnna_no_labels_take_closest(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)
        knn = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
        print(pred)
        np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
Example 6
    def test_BRKnna_predict_dense(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
Example 7
    def test_BRKnnb_predict_two_samples(self):
        data = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid5'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], pred)
Example 8
    def test_inner_kneighbors_more_neighbors(self):
        X = csr.csr_matrix([[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3]])
        y = csr.csr_matrix([[0.4, 0.4, 0.4], [2.4, 2.4, 2.4], [3.1, 3.1, 3.1], [1.1, 1.1, 1.1]])
        nearest_neighbors = NearestNeighbors()
        nearest_neighbors.fit(X)
        neighbors = BatchKNeighbors(nearest_neighbors)

        kneighbors = neighbors._batch_kneighbors(y, n_neighbors=2, batchsize=1)
        np.testing.assert_array_equal(kneighbors, np.matrix([[0, 1], [2,3], [3, 2], [1,2]]))

        kneighbors = neighbors._batch_kneighbors(y, n_neighbors=2, batchsize=3)
        np.testing.assert_array_equal(kneighbors, np.matrix([[0, 1], [2,3], [3, 2], [1,2]]))
Example 9
    def _load_sparse_mat(filename, name):
        """
        Load a csr matrix from HDF5 (https://stackoverflow.com/a/44282655)

        Parameters
        ----------
        name: str
            node prefix in HDF5 hierarchy

        filename: str
            HDF5 filename

        Returns
        ----------
        M : scipy.sparse.csr.csr_matrix
            loaded sparse matrix
        """
        import tables
        from scipy.sparse import csr_matrix

        with tables.open_file(filename) as f:
            # get nodes
            attributes = []
            for attribute in ("data", "indices", "indptr", "shape"):
                attributes.append(
                    getattr(f.root, f"{name}_{attribute}").read())

        # construct sparse matrix
        M = csr_matrix(tuple(attributes[:3]), shape=attributes[3])
        return M
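For reference, a sketch of the matching save routine under the same "{name}_*" naming convention; this is an assumption based on the loader above (and the linked Stack Overflow answer), not code from the original project:

def _save_sparse_mat(M, filename, name):
    """Hypothetical counterpart: store the four CSR arrays that _load_sparse_mat expects."""
    import numpy as np
    import tables
    from scipy.sparse import csr_matrix

    assert isinstance(M, csr_matrix)
    with tables.open_file(filename, 'w') as f:
        for attribute in ("data", "indices", "indptr", "shape"):
            f.create_array(f.root, f"{name}_{attribute}", np.asarray(getattr(M, attribute)))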
Example 10
 def test_correct_handling_equal_similarities_sparse_gk(self):
     sim_snn = 1. - shared_nearest_neighbors(self.distance)
     gamma_sparse = sparse_goodman_kruskal_index(csr_matrix(sim_snn),
                                                 self.labels)
     gamma_efficient = goodman_kruskal_index(sim_snn, self.labels,
                                             'similarity')
     return self.assertEqual(gamma_efficient, gamma_sparse)
Example 11
    def _deduplicate(self,
                     ignore_index=False) -> Union[pd.DataFrame, pd.Series]:
        # discard self-matches: A matches A
        pairs = self._matches_list[self._matches_list['master_side'] !=
                                   self._matches_list['dupe_side']]
        # rebuild graph adjacency matrix from already found matches:
        n = len(self._master)
        graph = csr_matrix(
            (np.full(len(pairs), 1),
             (pairs.master_side.to_numpy(), pairs.dupe_side.to_numpy())),
            shape=(n, n))
        # apply scipy.csgraph's clustering algorithm (result is a 1D numpy array of length n):
        _, groups = connected_components(csgraph=graph, directed=True)
        group_of_master_index = pd.Series(groups, name='raw_group_id')

        # merge groups with string indices to obtain two-column DataFrame:
        # note: the following line automatically creates a new column named 'index' with the corresponding indices:
        group_of_master_index = group_of_master_index.reset_index()

        # Determine weights for obtaining group representatives:
        # 1. option-setting group_rep='first':
        group_of_master_index.rename(columns={'index': 'weight'}, inplace=True)
        method = 'first'
        # 2. option-setting group_rep='centroid':
        if self._config.group_rep == GROUP_REP_CENTROID:
            # reuse the adjacency matrix built above (change the 1's to corresponding cosine similarities):
            graph.data = pairs['similarity'].to_numpy()
            # sum along the rows to obtain numpy 1D matrix of similarity aggregates then ...
            # ... convert to 1D numpy array (using asarray then squeeze) and then to Series:
            group_of_master_index['weight'] = pd.Series(
                np.asarray(graph.sum(axis=1)).squeeze())
            method = 'idxmax'

        # Determine the group representatives AND merge with indices:
        # pandas groupby transform function and enlargement enable both respectively in one step:
        group_of_master_index['group_rep'] = \
            group_of_master_index.groupby('raw_group_id', sort=False)['weight'].transform(method)

        # Prepare the output:
        prefix = GROUP_REP_PREFIX
        label = f'{prefix}{self._master.name}' if self._master.name else prefix[:-1]
        # use group rep indexes obtained in the last step above to select the corresponding strings:
        output = self._master.iloc[group_of_master_index.group_rep].rename(
            label).reset_index(drop=ignore_index)
        if isinstance(output, pd.DataFrame):
            output.rename(
                columns={col: f'{prefix}{col}' for col in output.columns if str(col) != label},
                inplace=True)
        if self._master_id is not None:
            id_label = f'{prefix}{self._master_id.name if self._master_id.name else DEFAULT_ID_NAME}'
            # use group rep indexes obtained above to select the corresponding string IDs:
            output_id = self._master_id.iloc[
                group_of_master_index.group_rep].rename(id_label).reset_index(
                    drop=True)
            output = pd.concat([output_id, output], axis=1)
        output.index = self._master.index
        return output.squeeze()
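The grouping step above hinges on scipy's connected_components; a standalone toy illustration (hypothetical data, not part of the class):

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

pairs = np.array([[0, 1], [1, 2], [3, 4]])   # hypothetical (master_side, dupe_side) matches
n = 5
graph = csr_matrix((np.ones(len(pairs)), (pairs[:, 0], pairs[:, 1])), shape=(n, n))
_, groups = connected_components(csgraph=graph, directed=True)
print(groups)   # e.g. [0 0 0 1 1] -> rows 0-2 form one group, rows 3-4 another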
Example 12
def dic2matrix(dic, ix=None, p=None, n=0, g=0):
    """
    将字典转化为FM所需的矩阵
    :param dic: 一个含有多个特征的字典
    :param ix: 下标的生成字典
    :param n: 记录条数
    :param g: 特征种类
    :return: 一个二值矩阵
    """
    if ix is None:
        ix = dict()

    # number of ones in the matrix
    nz = n * g

    col_ix = np.empty(nz, dtype=int)

    # encode each feature
    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            col_ix[i + g * t] = (lis[t] - 1) * g + i
        i += 1

    # feature-space dimension, i.e. the number of columns of the matrix
    if p is None:
        p = np.max(col_ix) + 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
Example 13
 def _write_mat(self, filename, complete=True):
     d = {
         'Af': csr_matrix((self.pdim, self.pdim)) if self._af is None else self._af,
         'Ad': csr_matrix((self.ndim, self.pdim)) if self._ad is None else self._ad,
         'Bf': csr_matrix((self.mdim, self.pdim)) if self._bf is None else self._bf,
     }
     if complete and self._complete:
         d['A'] = self._A
         d['B'] = self._B
     savemat(filename, d)
Example 14
def vectorize_dic(dic, ix=None, p=None):
    '''
    create a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature)

    :param dic: dictionary of feature lists. Keys are name of features
    :param ix:  index generator
    :param p:  dimension of feature space
    :return:
    '''
    if ix is None:
        ix = defaultdict(count().__next__)

    n = len(list(dic.values())[0])  # num samples
    g = len(dic.keys())  # num groups
    nz = n * g  # number of non-zeros

    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        # append index el with k in order to prevent mapping different columns with the same id to the same index
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)

    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])),
                          shape=(n, p)), ix
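A toy usage sketch (hypothetical data) for the Python-3 form of this encoder: each distinct (value, feature) pair gets its own one-hot column.

train = {'users': [1, 2, 1], 'items': [10, 10, 20]}
X, ix = vectorize_dic(train)
print(X.toarray())   # 3 x 4 binary matrix, one column per (value, feature) pair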
Example 15
def vectorize_dic(dic,ix=None,p=None,n=0,g=0):
    """
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of feature space (number of columns in the sparse matrix) (default None)
    """
    if ix==None:
        ix = dict()

    nz = n * g

    col_ix = np.empty(nz,dtype = int)

    i = 0
    for k,lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k),0) + 1
            col_ix[i+t*g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0,n),g)
    data = np.ones(nz)
    if p == None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx],(row_ix[ixx],col_ix[ixx])),shape=(n,p)),ix
Example 16
        def compatible(train_attrs, test_attrs):
            # attrs = []
            # attrs.extend(train_attrs)
            # attrs.extend(test_attrs)
            #
            # self.cv.fit(attrs)
            #
            # train_features = self.cv.transform(train_attrs)
            # test_features = self.cv.transform(test_attrs)

            row_ind = []
            col_ind = []

            train_sets = [str(row).lower() for row in train_attrs]
            test_sets = [str(row).lower() for row in test_attrs]

            for (i, s_i) in enumerate(test_sets):
                # si = set(fi.indices)
                j_ind = v_compatible(s_i, train_sets)
                j_ind[i] = 0
                ind = np.where(j_ind == 1)
                j_len = len(ind[0])
                if (j_len > 0):
                    row_ind.extend([i] * j_len)
                    col_ind.extend(ind[0])
                    # print(i, train_attrs[i], j_len)

            pk = csr_matrix(
                (np.ones(len(row_ind), dtype='int8'), (row_ind, col_ind)),
                shape=(len(test_attrs), len(train_attrs)))

            return pk
Example 17
def load_csr_graph(filename):
    """
    Loads graph from a file. Every line is of the format "V_origin,V_destination,Edge_weight".
    Returns a scipy.sparse.csr_matrix with the data.
    """
    raw = np.genfromtxt(filename, delimiter=",", dtype=np.int32)
    sp_raw = csr_matrix((raw[:, 2], (raw[:, 0], raw[:, 1])))
    return sp_raw
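A hypothetical round-trip check for the loader above: write a tiny comma-separated edge list, then read it back.

import numpy as np
from scipy.sparse import csr_matrix

with open('toy_graph.csv', 'w') as fh:
    fh.write('0,1,5\n1,2,3\n2,0,7\n')
print(load_csr_graph('toy_graph.csv').toarray())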
Example 18
def vectorize_dic(
        dic,
        ix=None,
        p=None,
        n=0,
        g=0):  # build the input matrix (size: number of users x number of movies) with scipy.sparse csr_matrix
    """
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of feature space (number of columns in the sparse matrix) (default None)
    """
    if ix == None:
        ix = dict()

    nz = n * g

    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    if p == None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])),
                          shape=(n, p)), ix
Example 19
def vectorize(lil, ix=None, p=None):
    """ 
    Creates a scipy csr matrix from a list of lists (each inner list is a set of values corresponding to a feature) 
    
    parameters:
    -----------
    lil -- list of lists (dimension of inner lists should be the same)
    ix -- index generator (default None)
    p -- dimension of feature space (number of columns in the sparse matrix) (default None)
    """
    if ix is None:
        ix = defaultdict(count().__next__)

    n = len(lil[0])  # num samples
    g = len(lil)  # num groups
    nz = n * g  # number of non-zeros

    col_ix = np.empty(nz, dtype=int)

    for i, d in enumerate(lil):
        # append index k with __i in order to prevent mapping different columns with the same id to the same index
        col_ix[i::g] = [ix[str(k) + '__' + str(i)] for k in d]

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    # only features that are less than p (size of the feature vector) are considered
    ixx = np.where(col_ix < p)

    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])),
                          shape=(n, p)), ix
def vectorize_dic(dic, label2index=None, hold_num=None):
    if (label2index == None):
        d = count(0)
        label2index = defaultdict(lambda: next(d))

    sample_num = len(list(dic.values())[0])  # num samples
    feat_num = len(list(dic.keys()))  # num of features
    total_value_num = sample_num * feat_num  # number of non-zeros

    col_ix = np.empty(total_value_num, dtype=int)

    i = 0
    for k, lis in dic.items():
        col_ix[i::feat_num] = [label2index[str(el) + str(k)] for el in lis]
        i += 1

    row_ix = np.repeat(np.arange(sample_num), feat_num)
    data = np.ones(total_value_num)

    if (hold_num == None):
        hold_num = len(label2index)

    left_data_index = np.where(col_ix < hold_num)

    return csr.csr_matrix((data[left_data_index],
                           (row_ix[left_data_index], col_ix[left_data_index])),
                          shape=(sample_num, hold_num)), label2index
Example 21
def statistical_inefficiencies(dtrajs,
                               lag,
                               C=None,
                               truncate_acf=True,
                               mact=2.0):
    """ Computes statistical inefficiencies of sliding-window transition counts at given lag

    Consider a discrete trajectory :math:`\{ x_t \}` with :math:`x_t \in \{1, ..., n\}`. For each starting state :math:`i`,
    we collect the target sequence

    .. math::
        Y^{(i)} = \{ x_{t+\tau} \mid x_t = i \}

    which contains the time-ordered target states at times :math:`t+\tau` whenever we started in state :math:`i`
    at time :math:`t`. Then we define the indicator sequence:

    .. math::
        a^{(i,j)}_t (\tau) = 1(Y^{(i)}_t = j)

    The statistical inefficiency for transition counts :math:`c_{ij}(\tau)` is computed as the statistical inefficiency
    of the sequence :math:`a^{(i,j)}_t (\tau)`.

    Parameters
    ----------
    dtrajs : list of int-iterables
        discrete trajectories
    lag : int
        lag time
    C : scipy sparse matrix (n, n) or None
        sliding window count matrix, if already available
    truncate_acf : bool, optional, default=True
        When the normalized autocorrelation function passes through 0, it is truncated in order to avoid integrating
        random noise

    Returns
    -------
    I : scipy sparse matrix (n, n)
        Statistical inefficiency matrix with a sparsity pattern identical to the sliding-window count matrix at the
        same lag time. Will contain a statistical inefficiency :math:`I_{ij} \in (0,1]` whenever there is a count
        :math:`c_{ij} > 0`. When there is no transition count (:math:`c_{ij} = 0`), the statistical inefficiency is 0.

    See also
    --------
    msmtools.util.statistics.statistical_inefficiency
        used to compute the statistical inefficiency for conditional trajectories

    """
    # count matrix
    if C is None:
        C = count_matrix_coo2_mult(dtrajs, lag, sliding=True, sparse=True)
    # split sequences
    splitseq = _split_sequences_multitraj(dtrajs, lag)
    # compute inefficiencies
    I, J = C.nonzero()
    it = (statistical_inefficiency(_indicator_multitraj(splitseq, i, j),
                                   truncate_acf=truncate_acf,
                                   mact=mact) for i, j in zip(I, J))
    data = np.fromiter(it, dtype=float, count=C.nnz)
    res = csr_matrix((data, (I, J)), shape=C.shape)
    return res
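An illustrative sketch of the indicator sequence defined in the docstring (toy data, not the library's internals): collect Y^(i) and a^{(i,j)}_t for i=0, j=1 at lag tau=1.

import numpy as np

dtraj = np.array([0, 1, 0, 0, 1, 2, 0, 1])
i, j = 0, 1
Y_i = dtraj[1:][dtraj[:-1] == i]      # target states observed one step after visiting state i
a_ij = (Y_i == j).astype(int)         # indicator sequence a^{(i,j)}_t
print(Y_i, a_ij)                      # [1 0 1 1] [1 0 1 1]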
Example 22
    def fetch(self, id_dict:dict, del_other=False, norm=False):
        train_cat_set = set()
        for id in id_dict['train']:
            for x in self.data[id]:
                train_cat_set.add(x)

        output = {}
        for split, id_list in id_dict.items():
            row, col, val = [], [], []
            for i, id in enumerate(id_list):
                num = len(self.data[id])
                for x in self.data[id]:
                    xx = x
                    if xx not in train_cat_set:
                        xx = 0
                    if xx != 0 or not del_other:
                        row.append(i)
                        col.append(xx)
                        if norm:
                            val.append(1.0 * self.id2val.get(xx, 1) / num)
                        else:
                            val.append(self.id2val.get(xx, 1))
            output[split] = csr_matrix((np.asarray(val), (np.asarray(row), np.asarray(col))),
                                       shape=(len(id_list), self.length))
        return output
Example 23
def create_csr_matrix(dic, index=None, dim=None):
    '''
    Convert the raw list-based input of the dataset into a CSR matrix.
    :param dic: dict of feature lists
    :param index: column-index mapping (built on the fly if None)
    :param dim: dimension of the feature space
    :return: (csr_matrix, index)
    '''
    if index is None:
        d = count(0)  # an infinite iterator starting from 0 with step 1
        index = defaultdict(
            lambda: next(d))  # returns the default value instead of raising KeyError on a missing key
    sample_num = len(list(dic.values())[0])  # number of samples: 90570
    feature_num = len(list(dic.keys()))  # number of features: 2
    total_num = sample_num * feature_num

    col_ix = np.empty(total_num, dtype=int)
    i = 0
    for k, lis in dic.items():
        col_ix[i::feature_num] = [index[str(k) + str(el)] for el in lis]
        i += 1

    row_ix = np.repeat(np.arange(sample_num),
                       feature_num)  # repeat each element feature_num times
    data = np.ones(total_num)

    if dim is None:
        dim = len(index)

    left_data_index = np.where(col_ix < dim)
    return csr.csr_matrix((data[left_data_index],
                           (row_ix[left_data_index], col_ix[left_data_index])),
                          shape=(sample_num, dim)), index
Example 24
def vectorize_dic(dic, label2index=None, hold_num=None):
    if label2index is None:
        d = count(0)
        label2index = defaultdict(lambda: next(d))  # value-to-index mapping

    sample_num = len(list(dic.values())[0])  # number of samples
    feat_num = len(list(dic.keys()))  # number of features
    total_value_num = sample_num * feat_num

    # np.empty returns an uninitialized array of the given shape and dtype
    col_ix = np.empty(total_value_num, dtype=int)

    i = 0
    for k, lis in dic.items():
        col_ix[i::feat_num] = [label2index[str(k) + str(el)] for el in lis]
        i += 1
    print("col_ix.shape:", col_ix.shape)
    print("col_ix:", col_ix)
    print(col_ix[0])

    row_ix = np.repeat(np.arange(sample_num), feat_num)
    data = np.ones(total_value_num)

    if hold_num is None:
        hold_num = len(label2index)

    left_data_index = np.where(col_ix < hold_num)  # drop test-set values that never appeared in the train set

    return csr.csr_matrix(
        (data[left_data_index], (row_ix[left_data_index], col_ix[left_data_index])),
        shape=(sample_num, hold_num)), label2index
Example 25
def vectorize(lil, ix=None, p=None):
    """
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of featrure space (number of columns in the sparse matrix) (default None)
    n -- number of samples
    g -- number of groups
    """
    if ix is None:
        ix = defaultdict(count().__next__)
    n = len(lil[0])  # num samples
    g = len(lil)  # num groups
    nz = n * g

    col_ix = np.empty(nz, dtype=int)

    for i, d in enumerate(lil):
        # append index k with __i in order to prevent mapping different columns with the same id to the same index
        col_ix[i::g] = [ix[str(k) + '__' + str(i)] for k in d]

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)

    if p == None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])),
                          shape=(n, p)), ix
Example 26
def test_from_csr1():

    from siconos.numerics import SBM_from_csparse, SBM_get_value
    from scipy.sparse.csr import csr_matrix

    M = csr_matrix([[1,2,3],
                    [4,5,6],
                    [7,8,9]])

    print(M.indices)
    print(M.indptr)
    print(M.data)
    
    blocksize = 3

    r,SBM = SBM_from_csparse(blocksize,M)

    assert SBM_get_value(SBM,0,0) == 1
    assert SBM_get_value(SBM,0,1) == 2
    assert SBM_get_value(SBM,0,2) == 3
    assert SBM_get_value(SBM,1,0) == 4
    assert SBM_get_value(SBM,1,1) == 5
    assert SBM_get_value(SBM,1,2) == 6
    assert SBM_get_value(SBM,2,0) == 7
    assert SBM_get_value(SBM,2,1) == 8
    assert SBM_get_value(SBM,2,2) == 9
def data_processing_hiv(name):
    # requesting for data in json format (it is stored as python dictionary)
    data_fr_predict = request.json
    # creating pandas dataframe from python dictionary
    data = pd.DataFrame(data_fr_predict, index=[0])

    # loading pickled pre-processing objects (feature list, label name, one-hot encoder) for the HIV dataset
    file = open('DataProcessingHIV.pkl', 'rb')
    feature = joblib.load(file)
    label = joblib.load(file)
    one_hot_encode = joblib.load(file)
    file.close()

    # separating string of 8 characters into 8 different features
    x_values = separate_feature_column(data, feature)

    # using encoding same as encoding used while data pre-processing prior to model building
    x_values = one_hot_encode.transform(x_values)

    # for GaussianNB model converting sparse matrix to dense matrix
    if name == 'GaussianNB':
        x_values = csr_matrix(x_values).todense()

    y_values = data[label].values

    return x_values, y_values
Example 28
 def test_build_perm_docs(self):
     perm_vectors = csr_matrix([
         [1, 0, 1],
         [0, 1, 1]
     ])
     api_vectors = csr_matrix([
         [1,0],
         [0,1]
     ])
     perm_docs = np.array([
         [1, 0],
         [0, 1],
         [1, 1]
     ])
     result = PerRecCBR.build_perm_docs(perm_vectors, api_vectors)
     assert_array_equal(perm_docs, result)
Example 29
def vectorize_dic(dic, ix=None, p=None, n=0, g=0):
    """
    dic -- dictionary of feature lists. Keys are the name of features
    ix -- index generator (default None)
    p -- dimension of feature space (number of columns in the sparse matrix) (default None)
    """
    if ix is None:
        ix = dict()
    # number of users * number of movies
    nz = n * g

    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            # increment the counter for this (value, feature) pair
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t]) + str(k), 0) + 1
            # store the resulting index at the corresponding position
            col_ix[i + t * g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)

    # print('col_ix', col_ix)
    data = np.ones(nz)
    if p == None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])),
                          shape=(n, p)), ix
Example 30
    def test_mul_sparse_matrix(self):

        # test unsymmetric times unsymmetric
        m = self.basic_m
        dense_m = m.toarray()
        res = m * m
        dense_res = np.matmul(dense_m, dense_m)
        self.assertFalse(res.is_symmetric)
        self.assertTrue(np.allclose(res.toarray(), dense_res))

        # test symmetric result
        m = self.basic_m
        dense_m = m.toarray()
        res = m.transpose() * m
        dense_res = np.matmul(dense_m.transpose(), dense_m)
        self.assertTrue(res.is_symmetric)
        self.assertTrue(np.allclose(res.toarray(), dense_res))

        # test unsymmetric with rectangular
        m = self.basic_m
        dense_m2 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])

        m2 = CSRMatrix(dense_m2)
        res = m * m2
        dense_res = np.matmul(m.toarray(), dense_m2)
        self.assertFalse(res.is_symmetric)
        self.assertTrue(np.allclose(res.toarray(), dense_res))

        # test unsymmetric with rectangular scipycsr
        m = self.basic_m
        dense_m2 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])

        m2 = csr_matrix(dense_m2)
        with self.assertRaises(Exception) as context:
            res = m * m2
Example 31
    def predict(self, X):
        #loaded_graph = tf.Graph()
        if self.pretrained_model_path:
            # Load model
            loader = tf.train.import_meta_graph(self.pretrained_model_path +
                                                '.meta')
            loader.restore(self.session, self.pretrained_model_path)

        elif self.validation_data_position:
            # Load model
            loader = tf.train.import_meta_graph(self._save_model_path +
                                                '.meta')
            loader.restore(self.session, self._save_model_path)

        prediction = np.zeros((X.shape[0], self.y.shape[1]))
        batch_generator = BatchGenerator(X, None, self.batch_size, False, True)
        prediction_steps = self._calc_num_steps(X)
        for i in range(prediction_steps):
            X_batch = batch_generator._batch_generator()
            preds = self._predict_batch(X_batch)
            binary_decided_preds = self._make_binary_decision(preds)
            prediction[i * self.batch_size:(i + 1) *
                       self.batch_size, :] = binary_decided_preds.todense()

        result = csr_matrix(prediction)

        # close the session, since it is no longer needed
        self.session.close()
        return result
Example 32
def vectorize_dic(dic, label2index=None, hold_num=None):
    if label2index is None:
        d = count(0)
        label2index = defaultdict(lambda: next(d))  # value-to-index mapping

    sample_num = len(list(dic.values())[0])  # number of samples
    feat_num = len(list(dic.keys()))  # number of features
    total_value_num = sample_num * feat_num

    col_ix = np.empty(total_value_num, dtype=int)  # column indices

    i = 0
    for k, lis in dic.items():
        col_ix[i::feat_num] = [label2index[str(k) + str(el)]
                               for el in lis]  # map the 'user' and 'item' values to column indices
        i += 1

    row_ix = np.repeat(np.arange(sample_num), feat_num)

    data = np.ones(total_value_num)

    if hold_num is None:
        hold_num = len(label2index)

    left_data_index = np.where(
        col_ix < hold_num)  # drop test-set values that never appeared in the train set

    return csr.csr_matrix((data[left_data_index],
                           (row_ix[left_data_index], col_ix[left_data_index])),
                          shape=(sample_num, hold_num)), label2index
Example 33
def dataProcess(dic, ix=None, p=None, n=0, g=0):
    """
    dic -- dictionary of feature lists. Keys are the name of features. 'user':train['user'].values
    ix -- index generator (default None)
    p -- dimension of feature space (the number of columns in the sparse matrix) (default None)
    """
    if ix is None:
        ix = dict()

    nz = n*g

    col_ix = np.empty(nz, dtype=int)

    i = 0
    for k, lis in dic.items():
        for t in range(len(lis)):
            ix[str(lis[t]) + str(k)] = ix.get(str(lis[t])+str(k), 0) + 1
            col_ix[i+t*g] = ix[str(lis[t]) + str(k)]
        i += 1

    row_ix = np.repeat(np.arange(0, n), g)
    data = np.ones(nz)
    if p is None:
        p = len(ix)

    ixx = np.where(col_ix < p)
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=(n, p)), ix
Example 34
    def _parse_sparse_matrix(self, matrix):
        from scipy.sparse import csr_matrix

        return csr_matrix(
            (list(matrix.data), list(matrix.indices), list(matrix.indptr)),
            shape=(matrix.number_of_rows, matrix.number_of_columns),
        )
Example 35
 def _deduplicate(self) -> Union[pd.DataFrame, pd.Series]:
     n = len(self._master)
     graph = csr_matrix((np.full(len(self._matches_list), 1),
                         (self._matches_list.master_side.to_numpy(),
                          self._matches_list.dupe_side.to_numpy())),
                        shape=(n, n))
     raw_group_id_of_master_id = pd.DataFrame({
         'raw_group_id':
         pd.Series(connected_components(csgraph=graph, directed=False)[1]),
         'master_id':
         self._master.index.to_series()
     })
     first_master_id_in_group = raw_group_id_of_master_id.groupby('raw_group_id')['master_id']\
         .first()\
         .rename('new_group_id')\
         .reset_index()
     new_group_id_of_master_id = first_master_id_in_group\
         .merge(raw_group_id_of_master_id, how='left', on='raw_group_id')\
         .sort_values('master_id')\
         .reset_index(drop=True)
     output = self._master[
         new_group_id_of_master_id.new_group_id].reset_index(drop=True)
     if self._master_id is None:
         return output
     else:
         output_id = self._master_id[
             new_group_id_of_master_id.new_group_id].reset_index(drop=True)
         return pd.concat([output_id, output], axis=1)
Example 36
def test_fcsr_matrix(m=None,
                     n=None,
                     k=None,
                     data_n=1,
                     density=.1,
                     sym=False,
                     return_np=True):
    while True:
        _k = k or np.random.randint(1, 10)
        if sym:
            _m = _n = m or np.random.randint(1, 100)
        else:
            _m = m or np.random.randint(1, 100)
            _n = n or np.random.randint(1, 100)

        rows_array, cols_array, data_array, shape = SNP_to_coo(_m,
                                                               _n,
                                                               density,
                                                               data_n=_k,
                                                               sym=sym)
        array_scipy = csr_matrix(
            (data_array.astype(FLOAT_STORAGE_np), (rows_array, cols_array)),
            shape)

        fcoo = coo_to_fcoo(rows_array, cols_array, data_array, shape)
        fcsr = fcoo_to_fcsr(fcoo, shape)
        array_sparse = fcsr_matrix(fcsr, shape)

        if return_np:
            array_np = array_sparse.to_array()
            yield array_np, array_sparse, array_scipy
        else:
            yield array_sparse, array_scipy
Example 37
def test_bcsr_matrix(m=None, n=None, density=.1, sym=False, return_np=True):
    while True:
        if sym:
            _m = _n = m or np.random.randint(1, 100)
        else:
            _m = m or np.random.randint(1, 100)
            _n = n or np.random.randint(1, 100)
        rows_array, cols_array, data_array, shape = SNP_to_coo(_m,
                                                               _n,
                                                               density,
                                                               data_n=1,
                                                               sym=sym)

        row_p, col_i = coo_to_bcsr(_m, len(rows_array), rows_array, cols_array)

        array_sparse = bcsr_matrix(row_p, col_i, (_m, _n))
        array_scipy = csr_matrix(
            (data_array.astype(FLOAT_STORAGE_np), (rows_array, cols_array)),
            shape)

        if return_np:
            array_np = array_scipy.toarray()
            yield array_np, array_sparse, array_scipy
        else:
            yield array_sparse, array_scipy
Example 38
File: fm.py Project: chenlighten/RS
def vectorize_dict(dic, dim=None):
    feature_num = len(list(dic.keys()))
    record_num = len(list(dic.items())[0][1])
    col_ix = np.zeros([feature_num*record_num])
    
    ix = {}
    i = 0
    for k in dic.keys():
        lis = dic[k]
        for t in range(len(lis)):
            ix[str(k) + str(lis[t])] = ix.get(str(k) + str(lis[t]), 0) + 1
            col_ix[t*feature_num + i] = ix[str(k) + str(lis[t])]
        i += 1

    # ix = {}
    # i = 0
    # count = 0
    # for k in dic.keys():
    #     lis = dic[k]
    #     for t in range(len(lis)):
    #         flag = str(k) + str(lis[t])
    #         if flag not in ix.keys():
    #             ix[flag] = count
    #             count += 1
    #         col_ix[t*feature_num + i] = ix[flag]
    
    if dim == None: dim = len(ix)
    row_ix = np.repeat(np.arange(0, record_num), feature_num)
    ixx = np.where(col_ix < dim)
    data = np.ones([feature_num*record_num])
    
    return csr.csr_matrix((data[ixx], (row_ix[ixx], col_ix[ixx])), shape=[record_num, dim]), ix
def predict_fn(input_object: str, model: Model):
  """ Takes parsed input and make predictions with the model loaded by model_fn.
      Perform prediction on the deserialized object, with the loaded model

  Parameters:
    input_object -- Deserialized request_body we can perform prediction on
    model        -- Tensorflow Model

  Return:
    predictions  -- Dict Object with Ordering Predicted by RecSys Model
  """  
  start_time        = time.time()

  topN              = 10

  # User UUID
  user_uuid         = input_object['user_uuid']

  # Get watched movies
  watched_movies_idx = [model.item_idx[i] for i in input_object['watched_movies']]
  
  # Inverse IDX to ID
  inv_item_idx = dict((v, k) for k, v in model.item_idx.items())    

  # Encode the watched movies as a one-hot input row
  data_input = csr_matrix((np.ones(len(watched_movies_idx)), 
                        (np.zeros(len(watched_movies_idx)), watched_movies_idx)),
                      shape=(1, model.input_dim)).toarray()
  data_pred  = model.predict(data_input)[0]

  print("--- Inference time: %s seconds ---" % (time.time() - start_time))
  
  # Sorted Recommender List
    
  idx_pred    = list(set(list(range(model.input_dim))) - set(watched_movies_idx))
  
  sorted_pred = dict(
                  sorted(
                      zip(
                          list(idx_pred), 
                          list(data_pred[idx_pred].astype(float))
                      ), 
                  key=lambda x: x[1],
              reverse=True))
      
  # Result Format
  result = {
    "status": "Ok",
    "evaluation": {
      "user_uuid":          input_object['user_uuid'],
      "watched_movies":     input_object['watched_movies'],
      "recommended_movie_ids": [inv_item_idx[i] for i in list(sorted_pred.keys())[:topN]],
      "scores":             list(sorted_pred.values())[:topN],
      "datetime":           datetime.utcnow().isoformat(sep='T', timespec='milliseconds'),
      "modelVersion":       model.version,
    }
  }

  return result
Example 40
    def test_BRKnnb_auto_optimize_k(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid2', 'lid3'], ['lid0', 'lid1']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3], auto_optimize_k=True)

        # noinspection PyUnusedLocal
        def fun(s, X, y_):
            return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]

        BRKNeighborsClassifier._get_split = fun
        knn.fit(data, y)
        self.assertEqual(3, knn.n_neighbors)
        pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)
def make_mtx_payload(df):
    if hasattr(df, "sparse"):
        sparse_mat = csr_matrix(df.sparse.to_coo())
    else:
        sparse_mat = vstack(x[0] for x in df.values)
    sink = BytesIO()
    mmwrite(sink, sparse_mat)
    return sink.getvalue()
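A hypothetical usage sketch with a small all-sparse pandas DataFrame, assuming the snippet's own imports (csr_matrix, vstack, mmwrite, BytesIO) are in scope; the vstack branch is for DataFrames whose cells hold sparse row matrices.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.eye(3)).astype(pd.SparseDtype(float, 0.0))
payload = make_mtx_payload(df)
print(payload.splitlines()[0])   # first line is the MatrixMarket header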
 def predict(self, X):
     predictions = csr_matrix((X.shape[0], self.y.shape[1]))
     doc_to_neighborhood_dict = self._predict_scores(X)
     
     for i in range(0,X.shape[0]):
         for label, _ in doc_to_neighborhood_dict[str(i + 1)]:
             predictions[i, label] = 1
     return predictions
Example 43
def load_sparse_csr(filename):
    '''
    :param filename: str
    :return: scipy.sparse.csr.csr_matrix
    '''
    filename += '.npz'
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
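A sketch of the matching writer, assumed (not taken from the original project) to produce the .npz layout that load_sparse_csr expects:

def save_sparse_csr(filename, array):
    # hypothetical counterpart: np.savez appends the .npz extension automatically
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)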
def load_h5_to_csr(filename):
    f = tables.open_file(filename, 'r')
    dest = f.root.destination.read()
    origin = f.root.origin.read()
    weight = f.root.weight.read()
    sp_raw = csr_matrix((weight, (origin,dest)))
    f.close()
    return sp_raw
Example 45
 def test_mp_gammai_sparse_parallel(self):
     """Test parallel version equivalent to serial version."""
     sim = csr_matrix(1. - self.dist)
     sim_s = mpgam_s(sim, 'similarity')
     sim_p = mutual_proximity_gammai(sim, 'similarity', mv=0, verbose=1)
     parallel_all_close_serial = np.allclose(sim_p.toarray(),
                                             sim_s.toarray())
     return self.assertTrue(parallel_all_close_serial)
Example 46
    def _compute_relations(self):
        logger.log(logging.INFO, "Computing relations")

        self.relations = {}
        contains = self._compute_contains()
        self.relations['contains'] = csr_matrix(contains)
        self.relations['contained'] = csr_matrix(self.relations['contains'].transpose())

        father = self._compute_father()

        for i, r in enumerate(['_substance', '_attribute', '_mode']):
            self.relations['father' + r] = dok_matrix(father[i])

        siblings = self._compute_siblings()
        self.relations['opposed'] = dok_matrix(siblings[0])
        self.relations['associated'] = dok_matrix(siblings[1])
        self.relations['crossed'] = dok_matrix(siblings[2])
        self.relations['twin'] = dok_matrix(siblings[3])

        # self._do_inhibitions()

        for i, r in enumerate(['_substance', '_attribute', '_mode']):
            self.relations['child' + r] = self.relations['father' + r].transpose()

        # self.relations['siblings'] = sum(siblings)
        # self.relations['inclusion'] = np.clip(self.relations['contains'] + self.relations['contained'], 0, 1)
        # self.relations['father'] = self.relations['father_substance'] + \
        #                            self.relations['father_attribute'] + \
        #                            self.relations['father_mode']
        # self.relations['child'] = self.relations['child_substance'] + \
        #                           self.relations['child_attribute'] + \
        #                           self.relations['child_mode']
        # self.relations['etymology'] = self.relations['father'] + self.relations['child']

        table = self._compute_table_rank(self.relations['contained'])
        for i in range(6):
            self.relations['table_%d'%i] = table[i]

        self.relations['identity'] = csr_matrix(np.eye(len(self.dictionary)))

        missing = {s for s in RELATIONS if s not in self.relations}
        if missing:
            raise ValueError("Missing relations : {%s}"%", ".join(missing))

        self.relations = {reltype: csr_matrix(self.relations[reltype]) for reltype in RELATIONS}
Example 47
def irconvolve(xc, x, y, h, 
        kernel=lambda r, h: numpy.exp(- 0.5 * (r / h) ** 2)):
    """ default kernel is gaussian
        exp(- 0.5 * (r / h) ** 2)
        xc has to be uniform!
    """
    xc, y, x, h = numpy.atleast_1d(xc, y, x, h)
    dxc = (xc[-1] - xc[0]) / (len(xc) - 1)
    support = 6

    # first remove those that are too far off
    good = ((x + support * h > xc[0]) \
          & (x - support * h < xc[-1]))
    x = x[good]
    y = y[good]
    h = h[good]

    if len(h) > 0:
        # the real buffer is bigger than out to ease the normalization
        # still on the edge we are imperfect
        padding = int((2 * support + 1)* h.max() / dxc) + 1
        padding = max(padding, 2)
        buffer = numpy.zeros(shape=len(xc) + 2 * padding)
        paddedxc = numpy.empty(buffer.shape, dtype=xc.dtype)
        paddedxc[padding:-padding] = xc
        # here comes the requirement xc has to be uniform.
        paddedxc[:padding] = xc[0] - numpy.arange(padding, 0, -1) * dxc
        paddedxc[-padding:] = xc[-1] + numpy.arange(1, padding +1) * dxc
        out = buffer[padding:-padding]
        assert len(out) == len(xc)
        assert (paddedxc[1:] > paddedxc[:-1]).all()

        # slow. for uniform xc/paddedxc, we can do this faster than search
        start = paddedxc.searchsorted(x - support * h, side='left')
        end = paddedxc.searchsorted(x + support * h, side='left')

        # tricky part, build the csr matrix for the conv operator,
        # only for the non-zero elements (block diagonal)
        N = end - start + 1
        indptr = numpy.concatenate(([0], N.cumsum()))
        indices = numpy.repeat(start - indptr[:-1], N) + numpy.arange(N.sum())
        r = numpy.repeat(x, N) - paddedxc[indices]
        data = kernel(r, numpy.repeat(h, N))
        data[numpy.repeat(N==1, N)] = 1
        data[numpy.repeat(h==0, N)] = 1
        matrix = csr.csr_matrix((data, indices, indptr), 
                shape=(len(x), len(paddedxc)))
        norm = numpy.repeat(matrix.sum(axis=1).flat, N)
        data /= norm
        buffer[:] = matrix.transpose() * y
    else:
        out = numpy.zeros(shape=xc.shape, dtype=y.dtype)
    return out
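A quick smoke test for irconvolve with made-up samples on a uniform grid (hypothetical values, only meant to show the call signature):

import numpy

xc = numpy.linspace(0.0, 10.0, 101)          # uniform output grid
x = numpy.array([2.0, 5.0, 7.5])             # sample positions
y = numpy.array([1.0, 2.0, 0.5])             # sample values
h = numpy.full_like(x, 0.3)                  # per-sample kernel widths
out = irconvolve(xc, x, y, h)
print(out.shape)                             # (101,)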
def generate_data(case, sparse=False):
    """Generate regression/classification data."""
    bunch = None
    if case == "regression":
        bunch = datasets.load_boston()
    elif case == "classification":
        bunch = datasets.fetch_20newsgroups_vectorized(subset="all")
    X, y = shuffle(bunch.data, bunch.target)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    if sparse:
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
    else:
        X_train = np.array(X_train)
        X_test = np.array(X_test)
    y_test = np.array(y_test)
    y_train = np.array(y_train)
    data = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
    return data
Example 49
    def train(self, X, mean=None):
        print('WARNING: You should probably be using SparseMatPCA, '
              'unless your design matrix fits in memory.', file=sys.stderr)

        n, d = X.shape
        # Can't subtract a sparse vector from a sparse matrix, apparently,
        # so here I repeat the vector to construct a matrix.
        mean = X.mean(axis=0)
        mean_matrix = csr_matrix(mean.repeat(n).reshape((d, n))).T
        X = X - mean_matrix

        super(SparsePCA, self).train(X, mean=numpy.asarray(mean).squeeze())
Example 50
    def __compute_sim_scores(self, refvec_matrix, allvecs_matrix, L2_norms, is_embeddings):
        contexts_sims = allvecs_matrix.dot(refvec_matrix)        
                
        if is_embeddings:
            contexts_sims = (contexts_sims + 1) / 2 # map cosine to [0,1]
            contexts_sims = np.reshape(contexts_sims, (len(contexts_sims), 1))
            contexts_sims = csr_matrix(contexts_sims.tolist())     
        if L2_norms is not None:
            contexts_sims = contexts_sims.multiply(L2_norms)           
            refvec_dp = refvec_matrix.transpose().dot(refvec_matrix)
            refvec_L2_norm = refvec_dp.data.max()**0.5 if len(refvec_dp.data) > 0 else 1.0
            contexts_sims.data /= refvec_L2_norm # weights -1 <= cosine <= 1, but in practice greater than zero because all weights >= 0

        return contexts_sims
Example 51
def standardize_kinship_format(in_file, out_file):
    '''Read the upper-triangular part of the kinship matrix from in_file. Convert to the
    format expected by KinshipDao and write to out_file.
    
    Note: loads the entire file into memory.'''
    data = np.loadtxt(in_file, usecols=[2])
    n = int(((8 * len(data) + 1) ** 0.5 - 1) / 2)
    idx = np.array(list(it.chain.from_iterable(range(k, n) for k in range(n))))
    idx_ptr = np.concatenate(([0], np.cumsum(range(n, 0, -1))))
    A = csr_matrix((np.maximum(data, 1e-16), idx, idx_ptr), shape=(n, n))
    with open(out_file, 'w') as f:
        f.write(' '.join(it.islice((x[1] for x in csv.reader(open(in_file), delimiter='\t')), n)) + '\n')
    with open(out_file, 'a') as f:
        np.savetxt(f, (A + triu(A, 1).transpose()).data, fmt='%.16f')
    return A
Example 52
def test_SBM_from_csparse1():
    from siconos.numerics import SBM_from_csparse,SBM_get_value, SBM_new_from_file, SBM_print, SBM_to_sparse
    from scipy.sparse import csr_matrix, lil_matrix

    A = lil_matrix((100, 100))
    A.setdiag(range(100))
    A[0, :10] = range(10)
    A[1, 10:20] = A[0, :10]

    M = csr_matrix(A)

    v,SBM=SBM_from_csparse(2,M)

    for i in range(M.shape[0]):
        for j in range(M.shape[1]):
            assert abs(SBM_get_value(SBM,i,j) - M[i,j]) < eps
Example 53
def test_sparseToSBM1():
    from siconos.numerics import sparseToSBM,getValueSBM, newFromFileSBM, printSBM, SBMtoSparse
    from scipy.sparse import csr_matrix, lil_matrix

    A = lil_matrix((100, 100))
    A.setdiag(range(100))
    A[0, :10] = range(10)
    A[1, 10:20] = A[0, :10]

    M = csr_matrix(A)

    v,SBM=sparseToSBM(2,M)

    for i in range(M.shape[0]):
        for j in range(M.shape[1]):
            assert abs(getValueSBM(SBM,i,j) - M[i,j]) < eps
Example 54
 def fit_transform(self, raw_documents, dsmMatrix, vocabulary):
     """use the sum of DSM vectors of a document's words as that document vector.
     here dsmMatrix is the matrix used in DSM that contains the distributional representations of all vocabulary words.
     """
     x = None
     i = 0
     num_exception = 0
     docnumber = 0
     docMatrices = []
     oov = 0
     oovDoc = 0
     for doc in raw_documents:
         if docnumber % 1000 == 0:
             print("processing document number " + str(docnumber + 1))
         docnumber += 1
         if oov > docnumber:
             print("warning: " + str(oov) + " oov.")
         docMatrix = None
         sequenceVectors = []
         words = doc.split()
         numAddedWords = 0
         for word in words:
             wordId = vocabulary.getindex(word)
             if not wordId:
                 oov += 1
                 continue
             # doc matrix is the sum of all its word vectors
             wordRepresentation = dsmMatrix.getSparseRow(wordId)
             sequenceVectors.append(wordRepresentation)
             numAddedWords += 1
             if numAddedWords == 20:
                 break
         if numAddedWords == 0:
             oovDoc += 1
         # if text is shorter than 20 words add some extra zero vectors
         while numAddedWords < 20:
             zeroSparse = csr_matrix((1, dsmMatrix.matrix.shape[1]))
             sequenceVectors.append(zeroSparse)
             numAddedWords += 1
         # hstack word vectors in sequence
         docMatrix = hstack(sequenceVectors)
         docMatrices.append(docMatrix)
     print "%d documents are completely out of vocabulary" % oovDoc
     print "vstacking matrices..."
     x = vstack(docMatrices)
     print "vstacking finished."
     return x
Example 55
    def train(self, X, mean=None):
        """
        .. todo::

            WRITEME
        """
        warnings.warn('You should probably be using SparseMatPCA, '
                      'unless your design matrix fits in memory.')

        n, d = X.shape
        # Can't subtract a sparse vector from a sparse matrix, apparently,
        # so here I repeat the vector to construct a matrix.
        mean = X.mean(axis=0)
        mean_matrix = csr_matrix(mean.repeat(n).reshape((d, n))).T
        X = X - mean_matrix

        super(SparsePCA, self).train(X, mean=numpy.asarray(mean).squeeze())
 def __init__( self , size , dim , m=np.array([]) , Consts=1.0 , f_inter=None ):
     super( LinearSpringConstrained , self ).__init__( size , dim , m , Consts , f_inter=f_inter )
     
     self.__dim = dim
     self.__size = size
     
     self.__K = Consts
     
     self.__A = np.zeros( ( size , dim ) )
     self.__F = np.zeros( ( size , dim ) )
     
     self.__Fm = dok.dok_matrix( ( size , size ) )
     self.__Fm2 = csr.csr_matrix( ( size , size ) )
             
     self.__M = np.zeros( ( size , 1 ) )
     if len(m) != 0 :
         self.set_masses( m )
def call_svd(ndarray_matrix, low_dims, logger, normalize=False):
    assert isinstance(logger, logging.Logger)
    assert isinstance(ndarray_matrix, numpy.ndarray)
    assert isinstance(low_dims, int)

    if normalize:
        processed_matrix = normalize_data(ndarray_matrix)
    else:
        processed_matrix = ndarray_matrix

    X = csr_matrix(processed_matrix)
    logger.info(u"original dims: {}".format(X.shape[1]))
    svd = TruncatedSVD(n_components=low_dims, random_state=0)
    X_input = svd.fit_transform(X)
    logger.info(u"after SVD dims: {}".format(X_input.shape[1]))

    return X_input
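A minimal usage sketch, assuming the numpy, logging, scipy and scikit-learn imports used by the function are in scope (normalize=False so the normalize_data helper, which is not shown here, is never reached):

import logging
import numpy

logging.basicConfig(level=logging.INFO)
X_low = call_svd(numpy.random.rand(50, 200), low_dims=10,
                 logger=logging.getLogger(__name__), normalize=False)
print(X_low.shape)   # (50, 10)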
Example 58
    def predict(self, X):
        """
        Predicts the classes for the samples. Takes the top k classes with smallest distance.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Prediction vector, where n_samples is the number of samples and
            n_features is the number of features.
        """
        predictions = csr_matrix((X.shape[0], self.y.shape[1]), dtype=int)
        
        topNIndices, _ = self._get_closest_centroids(X)
        
        for entry, label_list in enumerate(topNIndices):
            for label in label_list:
                predictions[entry, label] = 1
        return predictions
Example 59
 def fit_transform(self, raw_documents, dsmMatrix, vocabulary):
     """use the sum of DSM vectors of a document's words as that document vector.
     here dsmMatrix is the matrix used in DSM that contains the distributional representations of all vocabulary words.
     """
     x = None
     i = 0
     num_exception = 0
     docnumber = 0
     docMatrices = []
     oov = 0
     oovDoc = 0
     for doc in raw_documents:
         if docnumber % 1000 == 0:
             print("processing document number " + str(docnumber + 1))
         docnumber += 1
         if oov > docnumber * 5:
             print("warning: " + str(oov) + " oov.")
         docMatrix = None
         words = doc.split()
         for word in words:
             wordId = vocabulary.getindex(word)
             if not wordId:
                 oov += 1
                 continue
             # doc matrix is the sum of all its word vectors
             if docMatrix is not None:
                 docMatrix = docMatrix + dsmMatrix.getSparseRow(wordId)
             else:
                 docMatrix = dsmMatrix.getSparseRow(wordId)
         if docMatrix is None:
             print(dsmMatrix.matrix.shape)
             print("%d is the shape of dsmMatrix" % dsmMatrix.matrix.shape[1])
             docMatrix = csr_matrix((1, dsmMatrix.matrix.shape[1]))
             docMatrix[0, 0] = 0
             oovDoc += 1
         docMatrices.append(docMatrix)
     print "%d documents are completely out of vocabulary" % oovDoc
     print "vstacking matrices..."
     x = vstack(docMatrices)
     print "vstacking finished."
     return x