Example #1
0
def blend_svd(mats, factors=None, k=50):
    """
    Run a truncated SVD over the weighted sum of several labeled matrices.

    This is a specialised variant of blending for the case where only the
    SVD of the blend is wanted.

    :param mats: the matrices to blend; each must expose ``row_labels`` and
        ``col_labels``. It is iterated more than once.
    :param factors: one weight per matrix; when ``None`` they are computed
        with ``blend_factor(mat)`` for every matrix.
    :param k: passed through to ``svd_sum`` (presumably the number of
        singular values to keep -- confirm against ``svd_sum``).

    Like matrix.svd, returns a triple of:

    - U as a dense labeled matrix
    - S, a dense vector representing the diagonal of Sigma
    - V as a dense labeled matrix
    """
    if factors is None:
        factors = [blend_factor(mat) for mat in mats]

    # Align matrices.
    # FIXME: only works for fully labeled matrices right now.
    # TODO: could micro-optimize by using the first ordered set's indices.
    from csc_utils.ordered_set import OrderedSet

    def _align(axis_attr):
        # Merge the labels found under `axis_attr` on every matrix into one
        # ordered set. For each matrix, keep the values returned by
        # OrderedSet.add (presumably the label's index in the merged set)
        # as a uint64 array.
        merged = OrderedSet()
        index_maps = [
            np.array([merged.add(label) for label in getattr(mat, axis_attr)],
                     dtype=np.uint64)
            for mat in mats
        ]
        return merged, index_maps

    row_labels, row_mappings = _align('row_labels')
    col_labels, col_mappings = _align('col_labels')

    # Elide zero row tests, etc.

    from divisi2._svdlib import svd_sum
    from divisi2 import DenseMatrix
    Ut, S, Vt = svd_sum(mats, k, factors, row_mappings, col_mappings)

    # svd_sum hands back the transposed factors; flip them and attach the
    # merged labels to the rows of each result.
    return (
        DenseMatrix(Ut.T, row_labels, None),
        S,
        DenseMatrix(Vt.T, col_labels, None),
    )
Example #2
0
    def load_model(self, filename):
        """
        Loads SVD transformation (U, Sigma and V matrices) from a ZIP file

        The archive is read as follows:

        - ``README``: tab-separated ``key<TAB>value`` lines; only ``k`` is
          used here (falls back to 100 when missing or not an integer).
        - ``.U`` / ``.V``: first tried with ``loads``; if that fails the
          member is extracted into ``TMPDIR`` and read with ``fromfile`` as a
          flat array that is cut into rows of ``k`` values, labeled with the
          ids listed (one per line) in ``.row_ids`` / ``.col_ids``.
        - ``.S``: read with ``loads``.
        - ``.shifts.row``, ``.shifts.col``, ``.shifts.total``: optional; read
          only when ``.shifts.row`` is present.

        Sets ``self._U``, ``self._S``, ``self._V`` and ``self._shifts``, then
        rebuilds the matrix and the similarity matrix.

        NOTE(review): ``loads`` is presumably a pickle loader -- only load
        model files that come from a trusted source.

        :param filename: path to the SVD matrix transformation (a ZIP file).
            If it cannot be opened as given, ``filename + '.zip'`` is tried.
        :type filename: string
        """
        try:
            archive = zipfile.ZipFile(filename, allowZip64=True)
        except (IOError, OSError, zipfile.BadZipfile):
            archive = zipfile.ZipFile(filename + '.zip', allowZip64=True)

        def read_labels(member):
            # One id per line. Blank lines are dropped; ids are returned as
            # ints when every one of them parses, otherwise as raw strings.
            # Bytes separators keep this working on Python 3, where
            # ZipFile.read() returns bytes (on Python 2 b'\n' is just '\n').
            entries = [
                entry.strip()
                for entry in archive.read(member).split(b'\n') if entry
            ]
            try:
                return [int(entry) for entry in entries]
            except ValueError:
                return entries

        def read_matrix(member, labels_member, k):
            # Preferred format: a serialized matrix object.
            try:
                return loads(archive.read(member))
            except Exception:
                # Any failure here means the member is not in the serialized
                # format (large models are stored as a raw array dump), so
                # fall back to the raw loader. The extracted copy is left in
                # TMPDIR.
                raw_path = archive.extract(member, TMPDIR)
            flat = fromfile(raw_path)
            # Floor division: a trailing partial row is ignored (this is what
            # integer `/` did on Python 2).
            vectors = [
                DenseVector(flat[k * i:k * (i + 1)])
                for i in range(len(flat) // k)
            ]
            return DenseMatrix(vectors, OrderedSet(read_labels(labels_member)),
                               None)

        try:
            # Options file
            options = dict()
            for line in archive.open('README'):
                data = line.strip().split(b'\t')
                if len(data) < 2:
                    # Blank or malformed line: nothing to record.
                    continue
                options[data[0]] = data[1]
            try:
                k = int(options[b'k'])
            except (KeyError, ValueError):
                k = 100  #TODO: nasty!!!

            # Load U, S, and V
            self._U = read_matrix('.U', '.row_ids', k)
            self._V = read_matrix('.V', '.col_ids', k)
            self._S = loads(archive.read('.S'))

            # Shifts for Mean Centerer Matrix
            self._shifts = None
            if '.shifts.row' in archive.namelist():
                self._shifts = [
                    loads(archive.read('.shifts.row')),
                    loads(archive.read('.shifts.col')),
                    loads(archive.read('.shifts.total'))
                ]
        finally:
            # Always release the file handle, even when loading fails.
            archive.close()

        self._reconstruct_matrix(shifts=self._shifts, force=True)
        self._reconstruct_similarity(force=True)