def copy(self, shallow=False):
     """
     Return a copy of this matrix.

     The base object is produced by ``Sparse3DMatrix.copy``; ``count`` and
     ``lengths`` are then duplicated with their own ``copy()`` method.

     :param shallow: if True, names and group information are not copied
     :return: the copied matrix
     """
     dmat = Sparse3DMatrix.copy(self)
     # ``count`` / ``lengths`` stay None until something is loaded into them,
     # so calling ``.copy()`` unconditionally would raise AttributeError.
     dmat.count = None if self.count is None else self.count.copy()
     dmat.lengths = None if self.lengths is None else self.lengths.copy()
     dmat.num_loci, dmat.num_haplotypes, dmat.num_reads = dmat.shape
     if not shallow:
         dmat.__copy_names(self)
         dmat.__copy_group_info(self)
     return dmat
 def __mul__(self, other):
     """
     Multiply through ``Sparse3DMatrix.__mul__`` and attach meta data to the
     result.

     When ``other`` is a NumPy array or one of the SciPy sparse matrix types
     listed below, only haplotype names, read names and read IDs are carried
     over and ``num_groups`` is set to 0. For any other operand the full name
     and group information of ``self`` is copied.

     :param other: right-hand operand forwarded to ``Sparse3DMatrix.__mul__``
     :return: the product object returned by the base class
     """
     product = Sparse3DMatrix.__mul__(self, other)
     (product.num_loci,
      product.num_haplotypes,
      product.num_reads) = product.shape

     matrix_types = (np.ndarray, csc_matrix, csr_matrix, coo_matrix,
                     lil_matrix)
     if not isinstance(other, matrix_types):
         product.__copy_names(self)
         product.__copy_group_info(self)
         return product

     # Matrix operand: locus names and group info are not copied here.
     product.hname = self.hname
     product.rname = copy.copy(self.rname)
     product.rid = copy.copy(self.rid)
     product.num_groups = 0
     return product
 def combine(self, other, shallow=False):
     """
     Combine this matrix with ``other`` via ``Sparse3DMatrix.combine``.

     ``count`` vectors/matrices of both operands are stacked along their
     first axis, ``lengths`` is taken from ``self``, and (unless ``shallow``)
     read names are concatenated while locus names, locus IDs, haplotype
     names and group information come from ``self``.

     :param other: the matrix to combine with; must be finalized
     :param shallow: if True, names and group information are not set
     :return: the combined matrix
     :raises RuntimeError: if either matrix is not finalized
     """
     from scipy.sparse import issparse, vstack

     if not (self.finalized and other.finalized):
         raise RuntimeError('Both matrices must be finalized.')

     dmat = Sparse3DMatrix.combine(self, other)
     dmat.num_loci, dmat.num_haplotypes, dmat.num_reads = dmat.shape
     if self.count is not None and other.count is not None:
         if issparse(self.count) or issparse(other.count):
             # np.concatenate cannot stack SciPy sparse matrices (count is a
             # csc_matrix for multi-sample data), so stack them sparsely.
             dmat.count = vstack((self.count, other.count), format='csc')
         else:
             dmat.count = np.concatenate((self.count, other.count))
     if self.lengths is not None:
         dmat.lengths = copy.copy(self.lengths)
     if not shallow:
         dmat.hname = self.hname
         dmat.lname = copy.copy(self.lname)
         dmat.rname = np.concatenate((self.rname, other.rname))
         dmat.lid = copy.copy(self.lid)
         # Read IDs are rebuilt because the read axis now spans both inputs
         dmat.rid = dict(zip(dmat.rname, np.arange(dmat.num_reads)))
         dmat.__copy_group_info(self)
     return dmat
    def bundle(
            self,
            reset=False,
            shallow=False):  # Copies the original matrix (Use lots of memory)
        """
        Returns ``AlignmentPropertyMatrix`` object in which loci are bundled using grouping information.

        :param reset: if True, call ``reset()`` on the bundled matrix before returning it
        :param shallow: if True, skip copying the name and ID meta data
        :return: the bundled matrix, whose locus axis has ``num_groups`` entries
        :raises RuntimeError: if the matrix is not finalized or has no group information
        """
        if not self.finalized:
            raise RuntimeError('The matrix is not finalized.')
        if self.groups is None or self.gname is None:
            raise RuntimeError(
                'No group information is available for bundling.')

        # Locus-by-group indicator matrix: column i flags the loci in group i
        grp_conv_mat = lil_matrix((self.num_loci, self.num_groups))
        for i in range(self.num_groups):
            grp_conv_mat[self.groups[i], i] = 1.0
        grp_align = Sparse3DMatrix.__mul__(
            self, grp_conv_mat)  # The core of the bundling
        grp_align.num_loci = self.num_groups
        grp_align.num_haplotypes = self.num_haplotypes
        grp_align.num_reads = self.num_reads
        grp_align.shape = (grp_align.num_loci,
                           grp_align.num_haplotypes,
                           grp_align.num_reads)
        # NOTE: count is shared with self by reference, not copied
        grp_align.count = self.count
        if not shallow:
            # Group names take the place of locus names in the result
            grp_align.lname = copy.copy(self.gname)
            grp_align.hname = self.hname
            grp_align.rname = copy.copy(self.rname)
            grp_align.sname = copy.copy(self.sname)
            grp_align.lid = dict(
                zip(grp_align.lname, np.arange(grp_align.num_loci)))
            grp_align.rid = copy.copy(self.rid)
            grp_align.sid = copy.copy(self.sid)
        if reset:
            grp_align.reset()
        return grp_align
 def save(self,
          h5file,
          title=None,
          index_dtype='uint32',
          data_dtype=float,
          incidence_only=True,
          complib='zlib',
          shallow=False):
     """
     Write the matrix and its meta data to a PyTables file.

     ``Sparse3DMatrix.save`` is called first with the same arguments; the
     file is then reopened in append mode to add ``lengths``, ``count`` and
     (unless ``shallow``) the haplotype, locus, read and sample names under
     the root node.

     A one-dimensional ``count`` is stored as a single array. A
     two-dimensional ``count`` is stored as a ``count`` group holding
     ``indptr``, ``indices`` and ``data``; as a side effect ``self.count`` is
     converted to ``csc_matrix`` in place if it is not one already.

     :param h5file: path of the file to write
     :param title: forwarded to ``Sparse3DMatrix.save``
     :param index_dtype: dtype used for the sparse index arrays of ``count``
                         (and, as written, for its data array too)
     :param data_dtype: forwarded to ``Sparse3DMatrix.save``
     :param incidence_only: forwarded to ``Sparse3DMatrix.save``
     :param complib: compression library name for ``tables.Filters``
     :param shallow: if True, names are not written
     """
     Sparse3DMatrix.save(self,
                         h5file=h5file,
                         title=title,
                         index_dtype=index_dtype,
                         data_dtype=data_dtype,
                         incidence_only=incidence_only,
                         complib=complib)
     h5fh = tables.open_file(h5file, 'a')
     try:
         fil = tables.Filters(complevel=1, complib=complib)
         if self.lengths is not None:
             h5fh.create_carray(h5fh.root,
                                'lengths',
                                obj=self.lengths,
                                title='Transcript Lengths',
                                filters=fil)
         if self.count is not None:
             if len(self.count.shape) == 1:  # count is a vector
                 h5fh.create_carray(h5fh.root,
                                    'count',
                                    obj=self.count,
                                    title='Equivalence Class Counts',
                                    filters=fil)
             elif len(self.count.shape) == 2:  # count is 2-dim matrix
                 if not isinstance(self.count, csc_matrix):
                     self.count = csc_matrix(self.count)
                     self.count.eliminate_zeros()
                 cgroup = h5fh.create_group(
                     h5fh.root, 'count',
                     'Sparse matrix components for N matrix')
                 h5fh.create_carray(cgroup,
                                    'indptr',
                                    obj=self.count.indptr.astype(index_dtype),
                                    filters=fil)
                 h5fh.create_carray(cgroup,
                                    'indices',
                                    obj=self.count.indices.astype(index_dtype),
                                    filters=fil)
                 # NOTE(review): data is cast with index_dtype, not
                 # data_dtype -- presumably counts are non-negative
                 # integers; confirm this is intended.
                 h5fh.create_carray(cgroup,
                                    'data',
                                    obj=self.count.data.astype(index_dtype),
                                    filters=fil)
         if not shallow:
             h5fh.set_node_attr(h5fh.root, 'hname', self.hname)
             h5fh.create_carray(h5fh.root,
                                'lname',
                                obj=self.lname,
                                title='Locus Names',
                                filters=fil)
             if self.rname is not None:
                 h5fh.create_carray(h5fh.root,
                                    'rname',
                                    obj=self.rname,
                                    title='Read Names',
                                    filters=fil)
             if self.sname is not None:
                 h5fh.create_carray(h5fh.root,
                                    'sname',
                                    obj=self.sname,
                                    title='Sample Names',
                                    filters=fil)
         h5fh.flush()
     finally:
         # Always release the file handle, even when a write above fails
         h5fh.close()
 def __sub__(self, other):
     """
     Subtract ``other`` through ``Sparse3DMatrix.__sub__`` and carry this
     matrix's dimensions, names and group information onto the result.

     :param other: right-hand operand forwarded to ``Sparse3DMatrix.__sub__``
     :return: the difference object returned by the base class
     """
     result = Sparse3DMatrix.__sub__(self, other)
     # Dimensions are taken from self, not from the result
     n_loci, n_haplotypes, n_reads = self.shape
     result.num_loci = n_loci
     result.num_haplotypes = n_haplotypes
     result.num_reads = n_reads
     result.__copy_names(self)
     result.__copy_group_info(self)
     return result
    def __init__(self,
                 other=None,
                 ecfile=None,
                 h5file=None,
                 datanode='/',
                 metanode='/',
                 shallow=False,
                 shape=None,
                 dtype=float,
                 haplotype_names=None,
                 locus_names=None,
                 read_names=None,
                 sample_names=None,
                 grpfile=None):
        """
        Build an alignment property matrix from one of several sources.

        ``Sparse3DMatrix.__init__`` is always called first. After that exactly
        one source is used, checked in this order: ``other``, ``ecfile``,
        ``h5file``, ``shape``. Group information from ``grpfile`` is loaded
        last, whichever source was used.

        :param other: existing matrix whose ``count``, ``lengths`` and (unless
                      ``shallow``) names and group info are copied
        :param ecfile: path to a binary equivalence-class file; only format 2
                       is read
        :param h5file: path to a PyTables file to load from
        :param datanode: node holding ``count``, ``lengths`` and the ``hname``
                         attribute in ``h5file``
        :param metanode: node holding ``lname``, ``rname`` and ``sname`` in
                         ``h5file``
        :param shallow: if True, names (and group info when copying) are
                        skipped
        :param shape: shape of an empty matrix; unpacked as
                      ``(num_loci, num_haplotypes, num_reads)``
        :param dtype: forwarded to ``Sparse3DMatrix.__init__``
        :param haplotype_names: haplotype names (used with ``shape`` only)
        :param locus_names: locus names (used with ``shape`` only)
        :param read_names: read names (used with ``shape`` only)
        :param sample_names: sample names (used with ``shape`` only)
        :param grpfile: group file handed to ``__load_groups``
        :raises RuntimeError: if a name list does not match the matrix shape
        :raises NotImplementedError: if ``ecfile`` is in format 1
        :raises TypeError: if ``ecfile`` is in format 0
        """
        Sparse3DMatrix.__init__(self,
                                other=other,
                                h5file=h5file,
                                datanode=datanode,
                                shape=shape,
                                dtype=dtype)

        self.num_loci, self.num_haplotypes, self.num_reads = self.shape
        self.num_samples = 0
        self.num_groups = 0
        self.count = None
        self.hname = None
        self.lname = None  # locus name
        self.rname = None  # read name
        self.sname = None  # sample name (e.g. sample barcodes::cell barcodes)
        self.lid = None  # locus ID
        self.rid = None  # read ID
        self.sid = None  # sample ID
        self.lengths = None  # transcript lengths (or effective lengths)
        self.gname = None  # group name
        self.groups = None  # groups in terms of locus IDs

        if other is not None:  # Use for copying from other existing AlignmentPropertyMatrix object
            if other.count is not None:
                self.count = copy.copy(other.count)
            if other.lengths is not None:
                self.lengths = copy.copy(other.lengths)
            if not shallow:
                self.__copy_names(other)
                self.__copy_group_info(other)

        elif ecfile is not None:
            with open(ecfile, 'rb') as f:

                def read_int():
                    # One little-endian 32-bit signed integer
                    return unpack('<i', f.read(4))[0]

                def read_ints(n):
                    # ``n`` consecutive little-endian 32-bit signed integers
                    return np.array(unpack('<{}i'.format(n), f.read(4 * n)))

                def read_name():
                    # Length-prefixed UTF-8 string
                    size = read_int()
                    raw = unpack('<{}s'.format(size), f.read(size))[0]
                    return raw.decode('utf-8')

                ecformat = read_int()
                # NOTE(review): any format other than 0, 1 or 2 falls through
                # silently and leaves the matrix empty -- confirm intended.
                if ecformat == 2:
                    self.num_haplotypes = read_int()
                    self.hname = np.array(
                        [read_name() for _ in range(self.num_haplotypes)])
                    # Get transcript info
                    self.num_loci = read_int()
                    self.lengths = np.zeros(
                        (self.num_loci, self.num_haplotypes), dtype=float)
                    tname = []
                    for tidx in range(self.num_loci):
                        tname.append(read_name())
                        # One length per haplotype follows each name
                        self.lengths[tidx] = read_ints(self.num_haplotypes)
                    self.lname = np.array(tname)
                    self.lid = dict(zip(self.lname, np.arange(self.num_loci)))
                    # Get sample info
                    self.num_samples = read_int()
                    self.sname = np.array(
                        [read_name() for _ in range(self.num_samples)])
                    self.sid = dict(
                        zip(self.sname, np.arange(self.num_samples)))
                    # Read in alignment matrix info
                    indptr_len = read_int()
                    self.num_reads = indptr_len - 1
                    nnz = read_int()
                    indptr_A = read_ints(indptr_len)
                    indices_A = read_ints(nnz)
                    data_A = read_ints(nnz)
                    # Read in EC count matrix info
                    indptr_len = read_int()
                    nnz = read_int()
                    indptr_N = read_ints(indptr_len)
                    indices_N = read_ints(nnz)
                    data_N = read_ints(nnz)
                    # Populate class member variables: each pass peels the
                    # lowest bit of data_A off as one haplotype's values; the
                    # last haplotype receives whatever quotient remains.
                    for _ in range(self.num_haplotypes - 1):
                        data_A, data_A_rem = np.divmod(data_A, 2)
                        self.data.append(
                            csr_matrix((data_A_rem, indices_A, indptr_A),
                                       shape=(self.num_reads, self.num_loci)))
                    self.data.append(
                        csr_matrix((data_A, indices_A, indptr_A),
                                   shape=(self.num_reads, self.num_loci)))
                    for hidx in range(self.num_haplotypes):
                        self.data[hidx].eliminate_zeros()
                    self.count = csc_matrix(
                        (data_N, indices_N, indptr_N),
                        shape=(self.num_reads, self.num_samples))
                    if self.num_samples == 1:
                        # A single sample is kept as a flat vector
                        self.count = self.count.toarray().flatten()
                    self.shape = (self.num_loci, self.num_haplotypes,
                                  self.num_reads)
                    self.finalize()
                elif ecformat == 1:
                    raise NotImplementedError
                elif ecformat == 0:
                    raise TypeError('Format 0 is not supported anymore.')

        elif h5file is not None:  # Use for loading from a pytables file
            h5fh = tables.open_file(h5file, 'r')
            try:
                if not shallow:
                    self.hname = h5fh.get_node_attr(datanode, 'hname')
                    self.lname = np.char.decode(
                        h5fh.get_node(metanode, 'lname').read(), 'utf-8')
                    self.lid = dict(zip(self.lname, np.arange(self.num_loci)))
                    if (metanode + '/rname') in h5fh:
                        self.rname = np.char.decode(
                            h5fh.get_node(metanode, 'rname').read(), 'utf-8')
                        self.rid = dict(
                            zip(self.rname, np.arange(self.num_reads)))
                    if (metanode + '/sname') in h5fh:
                        self.sname = np.char.decode(
                            h5fh.get_node(metanode, 'sname').read(), 'utf-8')
                        # num_samples must be set before building sid;
                        # otherwise zip() pairs against an empty range.
                        self.num_samples = len(self.sname)
                        self.sid = dict(
                            zip(self.sname, np.arange(self.num_samples)))
                if (datanode + '/count') in h5fh:
                    try:
                        self.count = h5fh.get_node(datanode,
                                                   'count').read()  # Format-1
                    except tables.NoSuchNodeError:  # Format-2
                        # Presumably 'count' is a group here, so ``read`` is
                        # looked up as a child node and not found -- confirm
                        # against the PyTables Group documentation.
                        nmat_node = h5fh.get_node(datanode + '/count')
                        indptr = h5fh.get_node(nmat_node, 'indptr').read()
                        indices = h5fh.get_node(nmat_node, 'indices').read()
                        data = h5fh.get_node(nmat_node, 'data').read()
                        self.count = csc_matrix((data, indices, indptr),
                                                dtype=np.float64)
                        self.num_samples = self.count.shape[1]
                if (datanode + '/lengths') in h5fh:
                    self.lengths = h5fh.get_node(datanode, 'lengths').read()
            finally:
                # Release the file handle even when loading fails part-way
                h5fh.close()

        elif shape is not None:  # Use for initializing an empty matrix
            if haplotype_names is not None:
                if len(haplotype_names) == self.num_haplotypes:
                    self.hname = haplotype_names
                else:
                    raise RuntimeError(
                        'The number of names does not match to the matrix shape.'
                    )
            if locus_names is not None:
                if len(locus_names) == self.num_loci:
                    self.lname = np.array(locus_names)
                    self.lid = dict(zip(self.lname, np.arange(self.num_loci)))
                else:
                    raise RuntimeError(
                        'The number of names does not match to the matrix shape.'
                    )
            if read_names is not None:
                if len(read_names) == self.num_reads:
                    self.rname = np.array(read_names)
                    self.rid = dict(zip(self.rname, np.arange(self.num_reads)))
                else:
                    raise RuntimeError(
                        'The number of names does not match to the matrix shape.'
                    )
            if sample_names is not None:
                self.sname = np.array(sample_names)
                # num_samples must be set before building sid; otherwise
                # zip() pairs against an empty range and sid ends up empty.
                self.num_samples = len(sample_names)
                self.sid = dict(zip(self.sname, np.arange(self.num_samples)))
            else:
                self.num_samples = 1

        if grpfile is not None:
            self.__load_groups(grpfile)