# These snippets rely on the legacy SciPy namespace that re-exported NumPy
# functions (sp.mean, sp.std, sp.issubdtype, ...); recent SciPy releases
# removed those re-exports, so `import numpy as sp` is the drop-in fix.
import scipy as sp


def standardize(x, M=None, S=None, REVERSE=None):
    """
    Function that standardizes the data
    Input:
        x: the data
        M: the mean vector
        S: the standard deviation vector
    Output:
        xs: the standardized data
        M: the mean vector
        S: the standard deviation vector
    """
    if not sp.issubdtype(x.dtype, float):
        do_convert = 1
    else:
        do_convert = 0
    if REVERSE is None:
        if M is None:
            M = sp.mean(x, axis=0)
            S = sp.std(x, axis=0)
            if do_convert:
                xs = (x.astype("float") - M) / S
            else:
                xs = (x - M) / S
            return xs, M, S
        else:
            if do_convert:
                xs = (x.astype("float") - M) / S
            else:
                xs = (x - M) / S
            return xs
    else:
        return S * x + M
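A minimal round-trip sketch with made-up data (the array values are illustrative only): the forward call estimates M and S, and passing any non-None REVERSE applies the inverse map S * x + M.

X = sp.array([[1, 2], [3, 4], [5, 6]])        # int array: exercises the float-conversion path
Xs, M, S = standardize(X)                     # forward: zero mean, unit variance per column
X_back = standardize(Xs, M, S, REVERSE=True)  # reverse: S * x + M undoes the transform
assert sp.allclose(X_back, X)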
def scale(x, M=None, m=None, REVERSE=None): """ Function that standardize the data Input: x: the data M: the Max vector m: the Min vector Output: x: the standardize data M: the Max vector m: the Min vector """ if not sp.issubdtype(x.dtype, float): do_convert = 1 else: do_convert = 0 if REVERSE is None: if M is None: M = sp.amax(x, axis=0) m = sp.amin(x, axis=0) if do_convert: xs = 2 * (x.astype("float") - m) / (M - m) - 1 else: xs = 2 * (x - m) / (M - m) - 1 return xs, M, m else: if do_convert: xs = 2 * (x.astype("float") - m) / (M - m) - 1 else: xs = 2 * (x - m) / (M - m) - 1 return xs else: return (1 + x) / 2 * (M - m) + m
def scale(self, x, M=None, m=None):  # TODO: DO IN PLACE SCALING
    """!@brief Function that scales the data between -1 and 1

    Input:
        x: the data
        M: the max vector
        m: the min vector
    Output:
        xs: the scaled data
    """
    n, d = x.shape
    if not sp.issubdtype(x.dtype, float):
        x = x.astype('float')

    # Initialization of the output
    xs = sp.empty_like(x)

    # Get the parameters of the scaling
    if M is None:
        M, m = sp.amax(x, axis=0), sp.amin(x, axis=0)

    # Scale column by column, leaving zero-range columns unchanged
    den = M - m
    for i in range(d):
        if den[i] != 0:
            xs[:, i] = 2 * (x[:, i] - m[i]) / den[i] - 1
        else:
            xs[:, i] = x[:, i]

    return xs
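A sketch of the behavioral difference from the function above, with made-up data; since `self` is unused in this snippet, the unbound function can be called directly with None (assuming this `scale` is the one in scope):

X = sp.array([[1.0, 7.0], [3.0, 7.0], [5.0, 7.0]])  # second column is constant
Xs = scale(None, X)
assert sp.allclose(Xs[:, 0], [-1.0, 0.0, 1.0])
assert sp.allclose(Xs[:, 1], 7.0)  # zero-range column passed through instead of NaN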
def _compare_gene(a, b):

    if sp.issubdtype(a.strain.dtype, sp.str_):
        _astrain = _codeUTF8(a.strain)
    else:
        _astrain = a.strain
    if sp.issubdtype(b.strain.dtype, sp.str_):
        _bstrain = _codeUTF8(b.strain)
    else:
        _bstrain = b.strain

    return ((a.chr == b.chr) &
            (a.strand == b.strand) &
            (sp.all(a.exons1 == b.exons1)) &
            (sp.all(a.exons2 == b.exons2)) &
            (sp.all(_astrain == _bstrain)) &
            (a.event_type == b.event_type) &
            (a.gene_idx == b.gene_idx) &
            (a.num_detected == b.num_detected))
assert sp.array_equal(sp.array([1, 2, 3], dtype=sp.int_), sp.int_([1, 2, 3]))

# Different dtypes still evaluate to equal arrays
assert sp.array_equal(sp.array([1, 2, 3], dtype=sp.int_),
                      sp.array([1, 2, 3], dtype=sp.float_))

# Get the type
v = sp.array([1, 2], dtype=sp.int32)
assert v.dtype == sp.int32

# Subtype check (True only on platforms where int_ is int32;
# use the generic sp.integer for a platform-independent check)
sp.issubdtype(sp.int32, sp.int_)

# Convert the type
v = sp.array([1, 2], dtype=sp.int32)
vf = v.astype(sp.float_)
assert vf.dtype == sp.float_

### type_ vs dtype
# Calling `sp.int_([...])` is the same as passing the dtype argument.
# That said, prefer the plain `sp.array` call without a dtype for uniformity,
# and pass the dtype argument explicitly when you need a specific type.
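The `sp.` calls above are the legacy SciPy re-exports; current SciPy has dropped them and NumPy 2.0 removed `float_`. A sketch of the same checks in current NumPy (the only assumption is NumPy >= 2.0 naming):

import numpy as np

assert np.array_equal(np.array([1, 2, 3], dtype=np.int_),
                      np.array([1, 2, 3], dtype=np.float64))  # equal despite dtypes
assert np.issubdtype(np.int32, np.integer)                    # generic hierarchy check
assert np.array([1, 2], dtype=np.int32).astype(np.float64).dtype == np.float64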
import math
import os
import pickle
import re
import sys

import h5py

# count_graph_coverage, Segmentgraph, codeUTF8, appendToHDF5 and the rproc
# module (rp) come from the surrounding package and are assumed in scope.


def count_graph_coverage_wrapper(fname_in, fname_out, options, sample_idx=None, qmode='all'):

    (genes, inserted) = pickle.load(open(fname_in, 'rb'))
    for g in genes:
        g.from_sparse()

    if genes[0].segmentgraph is None or genes[0].segmentgraph.is_empty():
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
            g.to_sparse()
        pickle.dump((genes, inserted), open(fname_in, 'wb'), -1)
        for g in genes:
            g.from_sparse()

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []
    counts['seg_len'] = sp.hstack([x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :]
                                   for x in genes]).T
    counts['gene_names'] = sp.array([x.name for x in genes], dtype='str')

    if not options.pyproc:
        if options.merge == 'single':
            print('\nprocessing %s' % (options.samples[sample_idx]))
            counts_tmp = count_graph_coverage(genes, options.bam_fnames[sample_idx], options)
        elif options.merge == 'merge_graphs' and qmode == 'single':
            print('\nquantifying merged graph in single mode (first file only) on %s' % options.samples[0])
            counts_tmp = count_graph_coverage(genes, options.bam_fnames[0], options)
        else:
            for s_idx in range(options.strains.shape[0]):
                print('\n%i/%i' % (s_idx + 1, options.strains.shape[0]))
                if s_idx == 0:
                    counts_tmp = count_graph_coverage(genes, options.bam_fnames[s_idx], options)
                else:
                    counts_tmp = sp.r_[sp.atleast_2d(counts_tmp),
                                       count_graph_coverage(genes, options.bam_fnames[s_idx], options)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1),
                                                   dtype='int') * c)
            tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
            if len(tmp) == 0:
                continue
            tmp = sp.hstack(tmp)
            if tmp.shape[0] > 0:
                counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, sp.arange(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * c)

        ### write result data to hdf5
        for key in counts:
            counts[key] = sp.vstack(counts[key]) if len(counts[key]) > 0 else counts[key]
        counts['edge_idx'] = counts['edges'][:, 0] if len(counts['edges']) > 0 else sp.array([])
        counts['edges'] = counts['edges'][:, 1:] if len(counts['edges']) > 0 else sp.array([])
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='strains', data=codeUTF8(options.strains))
        for key in counts:
            if sp.issubdtype(counts[key].dtype, sp.str_):
                h5fid.create_dataset(name=key, data=codeUTF8(counts[key]))
            else:
                h5fid.create_dataset(name=key, data=counts[key])
        h5fid.close()
    else:
        ### use an adaptive chunk size that takes the number of strains into account
        ### (take as many genes as it takes to have ~10K strains)
        if options.sparse_bam:
            chunksize = int(max(1, math.floor(1000000 / len(options.strains))))
        else:
            chunksize = int(max(1, math.floor(100000 / len(options.strains))))

        jobinfo = []
        PAR = dict()
        PAR['options'] = options
        if options.merge == 'single':
            PAR['options'].bam_fnames = PAR['options'].bam_fnames[sample_idx]
            PAR['options'].samples = PAR['options'].samples[sample_idx]
            PAR['options'].strains = PAR['options'].strains[sample_idx]

        #s_idx = sp.argsort([x.chr for x in genes]) # TODO
        s_idx = sp.arange(genes.shape[0])
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            fn = re.sub(r'.hdf5$', '', fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if os.path.exists(fn):
                continue
            else:
                print('submitting chunk %i to %i (%i)' % (c_idx, cc_idx, s_idx.shape[0]))
                PAR['genes'] = genes[s_idx][c_idx:cc_idx]
                for gg in PAR['genes']:
                    gg.to_sparse()
                PAR['fn_bam'] = options.bam_fnames
                PAR['fn_out'] = fn
                PAR['options'] = options
                jobinfo.append(rp.rproc('count_graph_coverage', PAR, 15000,
                                        options.options_rproc, 60 * 48))
        rp.rproc_wait(jobinfo, 30, 1.0, -1)
        del genes

        ### merge results from count chunks
        if options.verbose:
            print('\nCollecting count data from chunks ...\n')
            print('writing data to %s' % fname_out)

        ### write data to hdf5 continuously
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='gene_names', data=codeUTF8(counts['gene_names']))
        h5fid.create_dataset(name='seg_len', data=counts['seg_len'])
        h5fid.create_dataset(name='strains', data=codeUTF8(options.strains))
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            if options.verbose:
                print('collecting chunk %i-%i (%i)' % (c_idx, cc_idx, s_idx.shape[0]))
            fn = re.sub(r'.hdf5$', '', fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if not os.path.exists(fn):
                print('ERROR: Not all chunks in counting graph coverage completed!', file=sys.stderr)
                sys.exit(1)
            else:
                counts_tmp = pickle.load(open(fn, 'rb'))
                for c in range(counts_tmp.shape[1]):
                    if 'segments' in h5fid:
                        appendToHDF5(h5fid,
                                     sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]),
                                     'segments')
                        appendToHDF5(h5fid,
                                     sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]),
                                     'seg_pos')
                        appendToHDF5(h5fid,
                                     sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1),
                                             dtype='int') * (s_idx[c_idx + c]),
                                     'gene_ids_segs')
                    else:
                        h5fid.create_dataset(name='segments',
                                             data=sp.hstack([sp.atleast_2d(x.segments).T
                                                             for x in counts_tmp[:, c]]),
                                             chunks=True, compression='gzip',
                                             maxshape=(None, len(options.strains)))
                        h5fid.create_dataset(name='seg_pos',
                                             data=sp.hstack([sp.atleast_2d(x.seg_pos).T
                                                             for x in counts_tmp[:, c]]),
                                             chunks=True, compression='gzip',
                                             maxshape=(None, len(options.strains)))
                        h5fid.create_dataset(name='gene_ids_segs',
                                             data=sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1),
                                                          dtype='int') * (s_idx[c_idx + c]),
                                             chunks=True, compression='gzip',
                                             maxshape=(None, 1))
                    tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
                    if len(tmp) == 0:
                        continue
                    tmp = sp.hstack(tmp)
                    if tmp.shape[0] > 0:
                        if 'edges' in h5fid:
                            appendToHDF5(h5fid, tmp[:, sp.arange(1, tmp.shape[1], 2)], 'edges')
                            appendToHDF5(h5fid, tmp[:, 0], 'edge_idx')
                            appendToHDF5(h5fid,
                                         sp.ones((tmp.shape[0], 1), dtype='int') * (s_idx[c_idx + c]),
                                         'gene_ids_edges')
                        else:
                            h5fid.create_dataset(name='edges',
                                                 data=tmp[:, sp.arange(1, tmp.shape[1], 2)],
                                                 chunks=True, compression='gzip',
                                                 maxshape=(None, tmp.shape[1] // 2))  # h5py needs int dims
                            h5fid.create_dataset(name='edge_idx', data=tmp[:, 0],
                                                 chunks=True, compression='gzip',
                                                 maxshape=(None,))
                            h5fid.create_dataset(name='gene_ids_edges',
                                                 data=sp.ones((tmp.shape[0], 1), dtype='int') * (s_idx[c_idx + c]),
                                                 chunks=True, compression='gzip',
                                                 maxshape=(None, 1))
                del tmp, counts_tmp
        h5fid.close()
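`appendToHDF5` is a helper from the surrounding package and is not shown here. A minimal sketch of the resize-and-assign pattern it presumably wraps (function and file names hypothetical), which is why every dataset above is created with `chunks=True` and a `None` first `maxshape` dimension:

import h5py
import numpy as np

def append_to_hdf5(fid, data, name):
    # Grow dataset `name` along axis 0 and write `data` at the end.
    # Hypothetical stand-in for the package's appendToHDF5 helper.
    dset = fid[name]
    old = dset.shape[0]
    dset.resize(old + data.shape[0], axis=0)  # allowed because maxshape[0] is None
    dset[old:] = data

with h5py.File('counts.hdf5', 'w') as fid:
    fid.create_dataset('segments', data=np.zeros((4, 3)),
                       chunks=True, compression='gzip', maxshape=(None, 3))
    append_to_hdf5(fid, np.ones((2, 3)), 'segments')
    assert fid['segments'].shape == (6, 3)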