def copy(self, shallow=False):
    """
    Returns a copy of this matrix together with its counts and lengths.

    :param shallow: if True, names and group information are not carried over
    :return: the object produced by ``Sparse3DMatrix.copy`` with the extra
             attributes of this class filled in
    """
    dmat = Sparse3DMatrix.copy(self)
    # ``count`` and ``lengths`` are optional (the constructor leaves them as
    # None), so only call ``.copy()`` on them when they are actually set.
    dmat.count = None if self.count is None else self.count.copy()
    dmat.lengths = None if self.lengths is None else self.lengths.copy()
    dmat.num_loci, dmat.num_haplotypes, dmat.num_reads = dmat.shape
    if not shallow:
        dmat.__copy_names(self)
        dmat.__copy_group_info(self)
    return dmat
def __mul__(self, other):
    """
    Multiplies via ``Sparse3DMatrix.__mul__`` and restores the meta data.

    When ``other`` is a numpy array or a scipy sparse matrix only the
    haplotype names, read names and read IDs are carried over and the number
    of groups is reset to zero. For any other operand all names and the group
    information are copied from this matrix.

    :param other: the right-hand operand
    :return: the product with dimensions and meta data filled in
    """
    product = Sparse3DMatrix.__mul__(self, other)
    product.num_loci, product.num_haplotypes, product.num_reads = product.shape

    matrix_types = (np.ndarray, csc_matrix, csr_matrix, coo_matrix, lil_matrix)
    if not isinstance(other, matrix_types):
        product.__copy_names(self)
        product.__copy_group_info(self)
        return product

    # Matrix operand: locus names and groups are not carried over.
    product.hname = self.hname
    product.rname = copy.copy(self.rname)
    product.rid = copy.copy(self.rid)
    product.num_groups = 0
    return product
def combine(self, other, shallow=False):
    """
    Combines this matrix with ``other`` via ``Sparse3DMatrix.combine``.

    Counts of both matrices are concatenated when both are set; lengths are
    taken from this matrix. Unless ``shallow`` is set, haplotype and locus
    meta data are taken from this matrix, read names of both matrices are
    concatenated and read IDs are rebuilt over the combined read names.

    :param other: the matrix to combine with; must be finalized
    :param shallow: if True, names and group information are not carried over
    :return: the combined matrix
    :raises RuntimeError: if either matrix is not finalized
    """
    if not (self.finalized and other.finalized):
        raise RuntimeError('Both matrices must be finalized.')

    dmat = Sparse3DMatrix.combine(self, other)
    dmat.num_loci, dmat.num_haplotypes, dmat.num_reads = dmat.shape
    if self.count is not None and other.count is not None:
        dmat.count = np.concatenate((self.count, other.count))
    if self.lengths is not None:
        dmat.lengths = copy.copy(self.lengths)
    if not shallow:
        dmat.hname = self.hname
        dmat.lname = copy.copy(self.lname)
        dmat.lid = copy.copy(self.lid)
        # Read names are optional; concatenating None would raise inside
        # numpy, so only rebuild them when both operands carry read names.
        if self.rname is not None and other.rname is not None:
            dmat.rname = np.concatenate((self.rname, other.rname))
            dmat.rid = dict(zip(dmat.rname, np.arange(dmat.num_reads)))
        else:
            dmat.rname = None
            dmat.rid = None
        dmat.__copy_group_info(self)
    return dmat
def bundle(self, reset=False, shallow=False):  # Copies the original matrix (Use lots of memory)
    """
    Returns ``AlignmentPropertyMatrix`` object in which loci are bundled using
    grouping information.

    :param reset: whether to reset the values at the loci
    :param shallow: whether to copy all the meta data
    :return: the bundled matrix whose locus axis has one entry per group
    :raises RuntimeError: if the matrix is not finalized or if no group
                          information (``groups`` and ``gname``) is available
    """
    if not self.finalized:
        raise RuntimeError('The matrix is not finalized.')
    if self.groups is None or self.gname is None:
        raise RuntimeError('No group information is available for bundling.')

    # Loci-by-groups indicator matrix: entry (l, g) is 1 when locus l is
    # listed in group g.
    grp_conv_mat = lil_matrix((self.num_loci, self.num_groups))
    for i in range(self.num_groups):
        grp_conv_mat[self.groups[i], i] = 1.0
    grp_align = Sparse3DMatrix.__mul__(self, grp_conv_mat)  # The core of the bundling
    grp_align.num_loci = self.num_groups
    grp_align.num_haplotypes = self.num_haplotypes
    grp_align.num_reads = self.num_reads
    grp_align.shape = (grp_align.num_loci,
                       grp_align.num_haplotypes,
                       grp_align.num_reads)
    # Note: the count object is shared with this matrix, not copied.
    grp_align.count = self.count
    if not shallow:
        # Group names become the locus names of the bundled matrix.
        grp_align.lname = copy.copy(self.gname)
        grp_align.hname = self.hname
        grp_align.rname = copy.copy(self.rname)
        grp_align.sname = copy.copy(self.sname)
        grp_align.lid = dict(zip(grp_align.lname, np.arange(grp_align.num_loci)))
        grp_align.rid = copy.copy(self.rid)
        grp_align.sid = copy.copy(self.sid)
    if reset:
        grp_align.reset()
    return grp_align
def save(self, h5file, title=None, index_dtype='uint32', data_dtype=float,
         incidence_only=True, complib='zlib', shallow=False):
    """
    Saves the matrix with ``Sparse3DMatrix.save`` and then appends the extra
    data of this class (lengths, counts and, unless ``shallow``, the names)
    to the same HDF5 file.

    :param h5file: path of the HDF5 file to write
    :param title: passed through to ``Sparse3DMatrix.save``
    :param index_dtype: dtype used for the sparse index arrays of the count
                        matrix (also passed to ``Sparse3DMatrix.save``)
    :param data_dtype: passed through to ``Sparse3DMatrix.save``
    :param incidence_only: passed through to ``Sparse3DMatrix.save``
    :param complib: compression library used for the stored arrays
    :param shallow: if True, haplotype/locus/read/sample names are not written
    """
    Sparse3DMatrix.save(self, h5file=h5file, title=title,
                        index_dtype=index_dtype, data_dtype=data_dtype,
                        incidence_only=incidence_only, complib=complib)
    h5fh = tables.open_file(h5file, 'a')
    try:
        fil = tables.Filters(complevel=1, complib=complib)
        if self.lengths is not None:
            h5fh.create_carray(h5fh.root, 'lengths', obj=self.lengths,
                               title='Transcript Lengths', filters=fil)
        if self.count is not None:
            if len(self.count.shape) == 1:  # count is a vector
                h5fh.create_carray(h5fh.root, 'count', obj=self.count,
                                   title='Equivalence Class Counts', filters=fil)
            elif len(self.count.shape) == 2:  # count is 2-dim matrix
                # The count matrix is stored by its CSC components; note that
                # this converts ``self.count`` in place when it is not CSC yet.
                if not isinstance(self.count, csc_matrix):
                    self.count = csc_matrix(self.count)
                self.count.eliminate_zeros()
                cgroup = h5fh.create_group(
                    h5fh.root, 'count', 'Sparse matrix components for N matrix')
                h5fh.create_carray(cgroup, 'indptr',
                                   obj=self.count.indptr.astype(index_dtype),
                                   filters=fil)
                h5fh.create_carray(cgroup, 'indices',
                                   obj=self.count.indices.astype(index_dtype),
                                   filters=fil)
                # NOTE(review): the values are cast with ``index_dtype`` rather
                # than ``data_dtype`` -- presumably because counts are
                # non-negative integers; confirm this is intended.
                h5fh.create_carray(cgroup, 'data',
                                   obj=self.count.data.astype(index_dtype),
                                   filters=fil)
        if not shallow:
            h5fh.set_node_attr(h5fh.root, 'hname', self.hname)
            h5fh.create_carray(h5fh.root, 'lname', obj=self.lname,
                               title='Locus Names', filters=fil)
            if self.rname is not None:
                h5fh.create_carray(h5fh.root, 'rname', obj=self.rname,
                                   title='Read Names', filters=fil)
            if self.sname is not None:
                h5fh.create_carray(h5fh.root, 'sname', obj=self.sname,
                                   title='Sample Names', filters=fil)
        h5fh.flush()
    finally:
        # Always release the file handle, even when one of the writes fails.
        h5fh.close()
def __sub__(self, other):
    """
    Subtracts via ``Sparse3DMatrix.__sub__`` and restores the meta data.

    The dimensions of the result are taken from this matrix, and all names
    and group information are copied from this matrix.

    :param other: the right-hand operand
    :return: the difference with dimensions and meta data filled in
    """
    difference = Sparse3DMatrix.__sub__(self, other)
    (difference.num_loci,
     difference.num_haplotypes,
     difference.num_reads) = self.shape
    difference.__copy_names(self)
    difference.__copy_group_info(self)
    return difference
def __init__(self, other=None, ecfile=None, h5file=None, datanode='/', metanode='/',
             shallow=False, shape=None, dtype=float, haplotype_names=None,
             locus_names=None, read_names=None, sample_names=None, grpfile=None):
    """
    Builds the matrix from exactly one of four sources, checked in this
    order: another matrix (``other``), a binary equivalence-class file
    (``ecfile``), a PyTables file (``h5file``), or an empty ``shape``.

    :param other: existing matrix to copy counts, lengths and (unless
                  ``shallow``) names and group information from
    :param ecfile: path of a binary file starting with an int32 format code;
                   only format 2 is parsed
    :param h5file: path of a PyTables file to load from
    :param datanode: node holding ``count``/``lengths`` and the ``hname``
                     attribute in ``h5file``
    :param metanode: node holding ``lname``/``rname``/``sname`` in ``h5file``
    :param shallow: if True, names (and group information when copying) are
                    not loaded
    :param shape: (loci, haplotypes, reads) of an empty matrix
    :param dtype: passed through to ``Sparse3DMatrix.__init__``
    :param haplotype_names: names for the haplotype axis (``shape`` mode)
    :param locus_names: names for the locus axis (``shape`` mode)
    :param read_names: names for the read axis (``shape`` mode)
    :param sample_names: sample names (``shape`` mode); one unnamed sample is
                         assumed when omitted
    :param grpfile: optional group file handed to ``__load_groups``
    :raises RuntimeError: if a list of names does not match the matrix shape
    :raises NotImplementedError: if ``ecfile`` is in format 1
    :raises TypeError: if ``ecfile`` is in format 0
    """
    Sparse3DMatrix.__init__(self, other=other, h5file=h5file, datanode=datanode,
                            shape=shape, dtype=dtype)
    self.num_loci, self.num_haplotypes, self.num_reads = self.shape
    self.num_samples = 0
    self.num_groups = 0
    self.count = None
    self.hname = None
    self.lname = None    # locus name
    self.rname = None    # read name
    self.sname = None    # sample name (e.g. sample barcodes::cell barcodes)
    self.lid = None      # locus ID
    self.rid = None      # read ID
    self.sid = None      # sample ID
    self.lengths = None  # transcript lengths (or effective lengths)
    self.gname = None    # group name
    self.groups = None   # groups in terms of locus IDs

    if other is not None:  # Use for copying from other existing AlignmentPropertyMatrix object
        if other.count is not None:
            self.count = copy.copy(other.count)
        if other.lengths is not None:
            self.lengths = copy.copy(other.lengths)
        if not shallow:
            self.__copy_names(other)
            self.__copy_group_info(other)

    elif ecfile is not None:
        with open(ecfile, 'rb') as f:

            def read_int():
                # One little-endian 32-bit integer.
                return unpack('<i', f.read(4))[0]

            def read_str():
                # Length-prefixed UTF-8 string.
                size = read_int()
                return unpack('<{}s'.format(size), f.read(size))[0].decode('utf-8')

            def read_int_array(size):
                # ``size`` consecutive little-endian 32-bit integers.
                return np.array(unpack('<{}i'.format(size), f.read(4 * size)))

            ecformat = read_int()
            if ecformat == 2:
                # Get haplotype info
                self.num_haplotypes = read_int()
                self.hname = np.array(
                    [read_str() for _ in range(self.num_haplotypes)])

                # Get transcript info: each name is followed by one length
                # per haplotype.
                self.num_loci = read_int()
                self.lengths = np.zeros((self.num_loci, self.num_haplotypes),
                                        dtype=float)
                tname = []
                for tidx in range(self.num_loci):
                    tname.append(read_str())
                    for hidx in range(self.num_haplotypes):
                        self.lengths[tidx, hidx] = read_int()
                self.lname = np.array(tname)
                self.lid = dict(zip(self.lname, np.arange(self.num_loci)))

                # Get sample info
                self.num_samples = read_int()
                self.sname = np.array(
                    [read_str() for _ in range(self.num_samples)])
                self.sid = dict(zip(self.sname, np.arange(self.num_samples)))

                # Read in alignment matrix info
                indptr_len = read_int()
                self.num_reads = indptr_len - 1
                nnz = read_int()
                indptr_A = read_int_array(indptr_len)
                indices_A = read_int_array(nnz)
                data_A = read_int_array(nnz)

                # Read in EC count matrix info
                indptr_len = read_int()
                nnz = read_int()
                indptr_N = read_int_array(indptr_len)
                indices_N = read_int_array(nnz)
                data_N = read_int_array(nnz)

                # Populate class member variables. Each pass peels the lowest
                # bit off ``data_A`` to form one haplotype's matrix; whatever
                # remains after the loop goes to the last haplotype.
                for _ in range(self.num_haplotypes - 1):
                    data_A, data_A_rem = np.divmod(data_A, 2)
                    self.data.append(
                        csr_matrix((data_A_rem, indices_A, indptr_A),
                                   shape=(self.num_reads, self.num_loci)))
                self.data.append(
                    csr_matrix((data_A, indices_A, indptr_A),
                               shape=(self.num_reads, self.num_loci)))
                for hidx in range(self.num_haplotypes):
                    self.data[hidx].eliminate_zeros()
                self.count = csc_matrix(
                    (data_N, indices_N, indptr_N),
                    shape=(self.num_reads, self.num_samples))
                if self.num_samples == 1:
                    # A single sample is kept as a flat vector.
                    self.count = self.count.todense().A.flatten()
                self.shape = (self.num_loci, self.num_haplotypes, self.num_reads)
                self.finalize()
            elif ecformat == 1:
                raise NotImplementedError
            elif ecformat == 0:
                raise TypeError('Format 0 is not supported anymore.')

    elif h5file is not None:  # Use for loading from a pytables file
        h5fh = tables.open_file(h5file, 'r')
        try:
            if not shallow:
                self.hname = h5fh.get_node_attr(datanode, 'hname')
                self.lname = np.char.decode(
                    h5fh.get_node(metanode, 'lname').read(), 'utf-8')
                self.lid = dict(zip(self.lname, np.arange(self.num_loci)))
                if (metanode + '/rname') in h5fh:
                    self.rname = np.char.decode(
                        h5fh.get_node(metanode, 'rname').read(), 'utf-8')
                    self.rid = dict(zip(self.rname, np.arange(self.num_reads)))
                if (metanode + '/sname') in h5fh:
                    self.sname = np.char.decode(
                        h5fh.get_node(metanode, 'sname').read(), 'utf-8')
                    # The sample count must be known before the IDs are
                    # built, otherwise the ID range (and the dict) is empty.
                    self.num_samples = len(self.sname)
                    self.sid = dict(zip(self.sname, np.arange(self.num_samples)))
            if (datanode + '/count') in h5fh:
                try:
                    self.count = h5fh.get_node(datanode, 'count').read()  # Format-1
                except tables.NoSuchNodeError:  # Format-2
                    # 'count' is a group of sparse components, not an array.
                    nmat_node = h5fh.get_node(datanode + '/count')
                    indptr = h5fh.get_node(nmat_node, 'indptr').read()
                    indices = h5fh.get_node(nmat_node, 'indices').read()
                    data = h5fh.get_node(nmat_node, 'data').read()
                    self.count = csc_matrix((data, indices, indptr),
                                            dtype=np.float64)
                    self.num_samples = self.count.shape[1]
            if (datanode + '/lengths') in h5fh:
                self.lengths = h5fh.get_node(datanode, 'lengths').read()
        finally:
            # Always release the file handle, even when loading fails.
            h5fh.close()

    elif shape is not None:  # Use for initializing an empty matrix
        if haplotype_names is not None:
            if len(haplotype_names) != self.num_haplotypes:
                raise RuntimeError(
                    'The number of names does not match to the matrix shape.')
            self.hname = haplotype_names
        if locus_names is not None:
            if len(locus_names) != self.num_loci:
                raise RuntimeError(
                    'The number of names does not match to the matrix shape.')
            self.lname = np.array(locus_names)
            self.lid = dict(zip(self.lname, np.arange(self.num_loci)))
        if read_names is not None:
            if len(read_names) != self.num_reads:
                raise RuntimeError(
                    'The number of names does not match to the matrix shape.')
            self.rname = np.array(read_names)
            self.rid = dict(zip(self.rname, np.arange(self.num_reads)))
        if sample_names is not None:
            self.sname = np.array(sample_names)
            # Set the sample count first so the ID range covers every sample.
            self.num_samples = len(sample_names)
            self.sid = dict(zip(self.sname, np.arange(self.num_samples)))
        else:
            self.num_samples = 1

    if grpfile is not None:
        self.__load_groups(grpfile)