def _test_dna(self, chromo):
    # Select the sites of this chromosome
    pos = self.pos[self.chromo == chromo.encode()]
    dna = self.data['/inputs/dna'][self.chromo == chromo.encode()]
    dna_wlen = dna.shape[1]
    center = dna_wlen // 2
    dna_seq = read_chromo(os.path.join(self.data_path, '../dna_db'), chromo)
    # Check a sample of 100 evenly spaced sites
    idxs = np.linspace(0, len(pos) - 1, 100).astype(np.int32)
    for idx in idxs:
        # pos is 1-based; convert to a 0-based index into the chromosome
        p = pos[idx] - 1
        assert dna_seq[p:(p + 2)] == 'CG'
        assert dna[idx, center] == 3
        assert dna[idx, center + 1] == 2
        assert dna[idx, center + 10] == CHAR_TO_INT[dna_seq[p + 10]]
        assert dna[idx, center - 10] == CHAR_TO_INT[dna_seq[p - 10]]
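
# The asserts above pin down the integer encoding only at the CpG centre
# ('C' -> 3, 'G' -> 2); the remaining entries live in the data module's
# CHAR_TO_INT. A minimal sketch of one mapping that is consistent with those
# asserts (an assumption for illustration, not necessarily the module's exact
# table):
EXAMPLE_CHAR_TO_INT = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4}

def encode_seq(seq, char_to_int=EXAMPLE_CHAR_TO_INT):
    """Encode a DNA string as int8, e.g. 'CG' -> array([3, 2])."""
    return np.array([char_to_int[c] for c in seq.upper()], dtype=np.int8)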
def main(self, name, opts):
    if opts.seed is not None:
        np.random.seed(opts.seed)
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    # Check input arguments
    if not opts.cpg_profiles:
        if not (opts.pos_file or opts.dna_files):
            raise ValueError('Position table and DNA database expected!')

    if opts.dna_wlen and opts.dna_wlen % 2 == 0:
        raise ValueError('--dna_wlen must be odd!')
    if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
        raise ValueError('--cpg_wlen must be even!')

    # Parse functions for computing output statistics
    cpg_stats_meta = None
    win_stats_meta = None
    if opts.cpg_stats:
        cpg_stats_meta = get_stats_meta(opts.cpg_stats)
    if opts.win_stats:
        win_stats_meta = get_stats_meta(opts.win_stats)

    make_dir(opts.out_dir)
    outputs = OrderedDict()

    # Read single-cell profiles if provided
    if opts.cpg_profiles:
        log.info('Reading CpG profiles ...')
        outputs['cpg'] = read_cpg_profiles(
            opts.cpg_profiles,
            chromos=opts.chromos,
            nb_sample=opts.nb_sample,
            nb_sample_chromo=opts.nb_sample_chromo,
            log=log.info)

    # Create table with unique positions
    if opts.pos_file:
        # pos_file provides the CpG positions that are to be predicted.
        # Read positions from file
        log.info('Reading position table ...')
        pos_table = pd.read_table(opts.pos_file, usecols=[0, 1],
                                  dtype={0: str, 1: np.int32},
                                  header=None, comment='#')
        pos_table.columns = ['chromo', 'pos']
        pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
        pos_table = prepro_pos_table(pos_table)
    else:
        # No position file given: extract positions from the profiles, i.e.
        # predict every position that is covered in at least one cell.
        pos_tables = []
        for cpg_table in list(outputs['cpg'].values()):
            pos_tables.append(cpg_table[['chromo', 'pos']])
        pos_table = prepro_pos_table(pos_tables)

    if opts.chromos:
        pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
    if opts.nb_sample_chromo:
        pos_table = dat.sample_from_chromo(pos_table, opts.nb_sample_chromo)
    if opts.nb_sample:
        pos_table = pos_table.iloc[:opts.nb_sample]

    log.info('%d samples' % len(pos_table))

    make_dir(opts.out_dir)

    # Iterate over chromosomes
    # ------------------------
    for chromo in pos_table.chromo.unique():
        log.info('-' * 80)
        log.info('Chromosome %s ...' % (chromo))
        # idx is a boolean mask selecting the rows of the current chromosome
        idx = pos_table.chromo == chromo
        chromo_pos = pos_table.loc[idx].pos.values  # 1D numpy array of positions

        chromo_outputs = OrderedDict()
        if 'cpg' in outputs:
            # Concatenate CpG tables into single nb_site x nb_output matrix
            chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'],
                                                   chromo, chromo_pos)
            # chromo_outputs['cpg'] holds one mapped array per sample, with
            # values filled in at the target positions, e.g.
            # OrderedDict([('BS27_1_SER', array([1, 1, 1, ..., 1, 1, 0], dtype=int8)),
            #              ('BS27_3_SER', array([-1, 1, 1, ..., 1, -1, -1], dtype=int8))])
            chromo_outputs['cpg_mat'] = np.vstack(
                list(chromo_outputs['cpg'].values())).T
            # np.vstack stacks the per-sample arrays vertically; after the
            # transpose, cpg_mat has shape (nb_site, nb_sample), e.g.
            # (402166, 2) for the chr1 target positions and the two samples
            # BS27_1_SER and BS27_3_SER.
            assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

        if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
            cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
            assert np.all(cov >= 1)
            idx = cov >= opts.cpg_cov
            tmp = '%s sites matched minimum coverage filter'
            tmp %= format_out_of(idx.sum(), len(idx))
            log.info(tmp)
            if idx.sum() == 0:
                continue
            chromo_pos = chromo_pos[idx]
            chromo_outputs = select_dict(chromo_outputs, idx)

        # Read DNA of chromosome
        chromo_dna = None
        if opts.dna_files:
            # Only the sequence of the current chromosome is read;
            # chromo_dna is a string, e.g. len=195471971 for chr1.
            chromo_dna = fasta.read_chromo(opts.dna_files, chromo)

        annos = None
        if opts.anno_files:
            log.info('Annotating CpG sites ...')
            annos = dict()
            for anno_file in opts.anno_files:
                name = split_ext(anno_file)
                annos[name] = annotate(anno_file, chromo, chromo_pos)

        # Iterate over chunks
        # -------------------
        nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
        for chunk in range(nb_chunk):
            log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
            chunk_start = chunk * opts.chunk_size
            chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
            chunk_idx = slice(chunk_start, chunk_end)
            chunk_pos = chromo_pos[chunk_idx]

            # chunk_outputs is an OrderedDict of the 1D arrays of this chunk
            chunk_outputs = select_dict(chromo_outputs, chunk_idx)

            filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start, chunk_end)
            filename = os.path.join(opts.out_dir, filename)
            chunk_file = h5.File(filename, 'w')

            # Write positions (plain h5py create_dataset calls)
            chunk_file.create_dataset('chromo', shape=(len(chunk_pos),),
                                      dtype='S2')
            # Every entry holds the chromosome name as bytes;
            # chunk_file['chromo'].shape = (32768,)
            chunk_file['chromo'][:] = chromo.encode()
            chunk_file.create_dataset('pos', data=chunk_pos, dtype=np.int32)
            # chunk_file['pos'].shape = (32768,), i.e. the default chunk_size

            if len(chunk_outputs):
                # len(chunk_outputs) == 2 here ('cpg' and 'cpg_mat')
                out_group = chunk_file.create_group('outputs')
                # out_group is an h5py Group, initially empty

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in six.iteritems(chunk_outputs['cpg']):
                        # name is the sample name, e.g. 'BS27_1_SER' or
                        # 'BS27_3_SER'; value is a 1D array of shape (32768,)
                        assert len(value) == len(chunk_pos)
                        # Round continuous values
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value.round(),
                                                 dtype=np.int8,
                                                 compression='gzip')
                    # list(out_group) == ['cpg']
                    # list(out_group['cpg']) == ['BS27_1_SER', 'BS27_3_SER']

                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        # cpg_mat.shape = (32768, 2)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.cpg_stats_cov
                        for name, fun in six.iteritems(cpg_stats_meta):
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('cpg_stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')
                # Up to this point, chunk_file.visit(print) shows:
                # chromo, outputs, outputs/cpg, outputs/cpg/BS27_1_SER,
                # outputs/cpg/BS27_3_SER, pos

            # Write input features
            in_group = chunk_file.create_group('inputs')

            # DNA windows
            if chromo_dna:
                log.info('Extracting DNA sequence windows ...')
                dna_wins = extract_seq_windows(chromo_dna, pos=chunk_pos,
                                               wlen=opts.dna_wlen)
                # Given the sequence of one chromosome (chromo_dna), the
                # target positions (chunk_pos) and wlen=1001, this returns an
                # array of shape (32768, 1001); bases are stored as integers
                # rather than characters.
                assert len(dna_wins) == len(chunk_pos)
                in_group.create_dataset('dna', data=dna_wins, dtype=np.int8,
                                        compression='gzip')
                # in_group.visit(print) now shows only 'dna'

            # CpG neighbors
            if opts.cpg_wlen:
                log.info('Extracting CpG neighbors ...')
                cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                context_group = in_group.create_group('cpg')
                # outputs['cpg'], since neighboring CpG sites might lie
                # outside chunk borders and un-mapped values are needed
                for name, cpg_table in six.iteritems(outputs['cpg']):
                    # name is the sample name ('BS27_1_SER', 'BS27_3_SER');
                    # cpg_table is the per-sample table with chromo, pos and
                    # value columns
                    cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                    # Extract the state and distance of the cpg_wlen neighbors
                    state, dist = cpg_ext.extract(chunk_pos,
                                                  cpg_table.pos.values,
                                                  cpg_table.value.values)
                    nan = np.isnan(state)
                    # Missing neighbors are encoded as CPG_NAN (-1)
                    state[nan] = dat.CPG_NAN
                    dist[nan] = dat.CPG_NAN
                    # States can be binary (np.int8) or continuous
                    # (np.float32).
                    state = state.astype(cpg_table.value.dtype, copy=False)
                    dist = dist.astype(np.float32, copy=False)
                    assert len(state) == len(chunk_pos)
                    assert len(dist) == len(chunk_pos)
                    assert np.all((dist > 0) | (dist == dat.CPG_NAN))
                    group = context_group.create_group(name)
                    group.create_dataset('state', data=state,
                                         compression='gzip')
                    group.create_dataset('dist', data=dist,
                                         compression='gzip')
                    # list(group) == ['state', 'dist']

            if win_stats_meta is not None and opts.cpg_wlen:
                log.info('Computing window-based statistics ...')
                states = []
                dists = []
                cpg_states = []
                cpg_group = out_group['cpg']
                context_group = in_group['cpg']
                for output_name in six.iterkeys(cpg_group):
                    state = context_group[output_name]['state'].value
                    states.append(np.expand_dims(state, 2))
                    dist = context_group[output_name]['dist'].value
                    dists.append(np.expand_dims(dist, 2))
                    cpg_states.append(cpg_group[output_name].value)
                # samples x outputs x cpg_wlen
                states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                cpg_dists = np.zeros_like(cpg_states)
                states = np.concatenate([states, cpg_states], axis=2)
                dists = np.concatenate([dists, cpg_dists], axis=2)

                for wlen in opts.win_stats_wlen:
                    idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                    states_wlen = np.ma.masked_array(states, idx)
                    group = out_group.create_group('win_stats/%d' % wlen)
                    for name, fun in six.iteritems(win_stats_meta):
                        stat = fun[0](states_wlen)
                        if hasattr(stat, 'mask'):
                            idx = stat.mask
                            stat = stat.data
                            if np.sum(idx):
                                stat[idx] = dat.CPG_NAN
                        group.create_dataset(name, data=stat, dtype=fun[1],
                                             compression='gzip')

            if annos:
                log.info('Adding annotations ...')
                group = in_group.create_group('annos')
                for name, anno in six.iteritems(annos):
                    group.create_dataset(name, data=anno[chunk_idx],
                                         dtype='int8',
                                         compression='gzip')

            chunk_file.close()

    log.info('Done!')
    return 0
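
# A small inspection helper, assuming a chunk file written by main() above.
# It relies only on plain h5py calls and on the layout created above
# (chromo, pos, outputs/cpg/<sample>, inputs/dna, inputs/cpg/<sample>/state
# and .../dist); the file name below is a placeholder for whatever main()
# wrote into opts.out_dir.
def inspect_chunk_file(path='c1_000000-032768.h5'):
    with h5.File(path, 'r') as chunk_file:
        chunk_file.visit(print)                    # print every group/dataset path
        print(chunk_file['pos'].shape)             # (nb_site,)
        if 'dna' in chunk_file['inputs']:
            print(chunk_file['inputs/dna'].shape)  # (nb_site, dna_wlen)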
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    # Check input arguments
    if not (opts.cpg_profiles or opts.bulk_profiles):
        if not (opts.pos_file or opts.dna_files):
            raise ValueError('Position table and DNA database expected!')

    if opts.dna_wlen and opts.dna_wlen % 2 == 0:
        raise ValueError('--dna_wlen must be odd!')
    if opts.cpg_wlen and opts.cpg_wlen % 2 != 0:
        raise ValueError('--cpg_wlen must be even!')

    # Parse functions for computing output statistics
    cpg_stats_meta = None
    win_stats_meta = None
    if opts.stats:
        cpg_stats_meta = get_stats_meta(opts.stats)
    if opts.win_stats:
        win_stats_meta = get_stats_meta(opts.win_stats)

    make_dir(opts.out_dir)
    outputs = OrderedDict()

    # Read single-cell profiles if provided
    if opts.cpg_profiles:
        log.info('Reading single-cell profiles ...')
        outputs['cpg'] = read_cpg_profiles(opts.cpg_profiles,
                                           chromos=opts.chromos,
                                           nb_sample=opts.nb_sample)
    if opts.bulk_profiles:
        log.info('Reading bulk profiles ...')
        outputs['bulk'] = read_cpg_profiles(opts.bulk_profiles,
                                            chromos=opts.chromos,
                                            nb_sample=opts.nb_sample,
                                            round=False)

    # Create table with unique positions
    if opts.pos_file:
        # Read positions from file
        log.info('Reading position table ...')
        pos_table = pd.read_table(opts.pos_file, usecols=[0, 1],
                                  dtype={0: str, 1: np.int32},
                                  header=None, comment='#')
        pos_table.columns = ['chromo', 'pos']
        pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
        pos_table = prepro_pos_table(pos_table)
    else:
        # Extract positions from profiles
        pos_tables = []
        for cpg_table in list(outputs['cpg'].values()):
            pos_tables.append(cpg_table[['chromo', 'pos']])
        pos_table = prepro_pos_table(pos_tables)

    if opts.chromos:
        pos_table = pos_table.loc[pos_table.chromo.isin(opts.chromos)]
    if opts.nb_sample:
        pos_table = pos_table.iloc[:opts.nb_sample]

    log.info('%d samples' % len(pos_table))

    make_dir(opts.out_dir)

    # Iterate over chromosomes
    # ------------------------
    for chromo in pos_table.chromo.unique():
        log.info('-' * 80)
        log.info('Chromosome %s ...' % (chromo))
        idx = pos_table.chromo == chromo
        chromo_pos = pos_table.loc[idx].pos.values

        chromo_outputs = OrderedDict()
        if 'cpg' in outputs:
            # Concatenate CpG tables into single nb_site x nb_output matrix
            chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'],
                                                   chromo, chromo_pos)
            chromo_outputs['cpg_mat'] = np.vstack(
                list(chromo_outputs['cpg'].values())).T
            assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

        if 'bulk' in outputs:
            # Concatenate CpG tables into single nb_site x nb_output matrix
            chromo_outputs['bulk'] = map_cpg_tables(outputs['bulk'],
                                                    chromo, chromo_pos)

        if 'cpg_mat' in chromo_outputs and opts.cpg_cov:
            cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
            assert np.all(cov >= 1)
            idx = cov >= opts.cpg_cov
            tmp = '%s sites matched minimum coverage filter'
            tmp %= format_out_of(idx.sum(), len(idx))
            log.info(tmp)
            if idx.sum() == 0:
                continue
            chromo_pos = chromo_pos[idx]
            chromo_outputs = select_dict(chromo_outputs, idx)

        # Read DNA of chromosome
        chromo_dna = None
        if opts.dna_files:
            chromo_dna = fasta.read_chromo(opts.dna_files, chromo)

        annos = None
        if opts.anno_files:
            log.info('Annotating CpG sites ...')
            annos = dict()
            for anno_file in opts.anno_files:
                name = split_ext(anno_file)
                annos[name] = annotate(anno_file, chromo, chromo_pos)

        # Iterate over chunks
        # -------------------
        nb_chunk = int(np.ceil(len(chromo_pos) / opts.chunk_size))
        for chunk in range(nb_chunk):
            log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
            chunk_start = chunk * opts.chunk_size
            chunk_end = min(len(chromo_pos), chunk_start + opts.chunk_size)
            chunk_idx = slice(chunk_start, chunk_end)
            chunk_pos = chromo_pos[chunk_idx]

            chunk_outputs = select_dict(chromo_outputs, chunk_idx)

            filename = 'c%s_%06d-%06d.h5' % (chromo, chunk_start, chunk_end)
            filename = os.path.join(opts.out_dir, filename)
            chunk_file = h5.File(filename, 'w')

            # Write positions
            chunk_file.create_dataset('chromo', shape=(len(chunk_pos),),
                                      dtype='S2')
            chunk_file['chromo'][:] = chromo.encode()
            chunk_file.create_dataset('pos', data=chunk_pos, dtype=np.int32)

            if len(chunk_outputs):
                out_group = chunk_file.create_group('outputs')

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    for name, value in chunk_outputs['cpg'].items():
                        assert len(value) == len(chunk_pos)
                        out_group.create_dataset('cpg/%s' % name,
                                                 data=value,
                                                 dtype=np.int8,
                                                 compression='gzip')
                    # Compute and write statistics
                    if cpg_stats_meta is not None:
                        log.info('Computing per CpG statistics ...')
                        cpg_mat = np.ma.masked_values(chunk_outputs['cpg_mat'],
                                                      dat.CPG_NAN)
                        mask = np.sum(~cpg_mat.mask, axis=1)
                        mask = mask < opts.stats_cov
                        for name, fun in cpg_stats_meta.items():
                            stat = fun[0](cpg_mat).data.astype(fun[1])
                            stat[mask] = dat.CPG_NAN
                            assert len(stat) == len(chunk_pos)
                            out_group.create_dataset('stats/%s' % name,
                                                     data=stat,
                                                     dtype=fun[1],
                                                     compression='gzip')

                # Write bulk profiles
                if 'bulk' in chunk_outputs:
                    for name, value in chunk_outputs['bulk'].items():
                        assert len(value) == len(chunk_pos)
                        out_group.create_dataset('bulk/%s' % name,
                                                 data=value,
                                                 dtype=np.float32,
                                                 compression='gzip')

            # Write input features
            in_group = chunk_file.create_group('inputs')

            # DNA windows
            if chromo_dna:
                log.info('Extracting DNA sequence windows ...')
                dna_wins = extract_seq_windows(chromo_dna, pos=chunk_pos,
                                               wlen=opts.dna_wlen)
                assert len(dna_wins) == len(chunk_pos)
                in_group.create_dataset('dna', data=dna_wins, dtype=np.int8,
                                        compression='gzip')

            # CpG neighbors
            if opts.cpg_wlen:
                log.info('Extracting CpG neighbors ...')
                cpg_ext = fext.KnnCpgFeatureExtractor(opts.cpg_wlen // 2)
                context_group = in_group.create_group('cpg')
                # outputs['cpg'], since neighboring CpG sites might lie
                # outside chunk borders and un-mapped values are needed
                for name, cpg_table in outputs['cpg'].items():
                    cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                    state, dist = cpg_ext.extract(chunk_pos,
                                                  cpg_table.pos.values,
                                                  cpg_table.value.values)
                    nan = np.isnan(state)
                    state[nan] = dat.CPG_NAN
                    dist[nan] = dat.CPG_NAN
                    state = state.astype(np.int8, copy=False)
                    dist = dist.astype(np.float32, copy=False)
                    assert len(state) == len(chunk_pos)
                    assert np.all((state == 0) | (state == 1) |
                                  (state == dat.CPG_NAN))
                    assert len(dist) == len(chunk_pos)
                    assert np.all((dist > 0) | (dist == dat.CPG_NAN))
                    group = context_group.create_group(name)
                    group.create_dataset('state', data=state,
                                         compression='gzip')
                    group.create_dataset('dist', data=dist,
                                         compression='gzip')

            if win_stats_meta is not None and opts.cpg_wlen:
                log.info('Computing window-based statistics ...')
                states = []
                dists = []
                cpg_states = []
                cpg_group = out_group['cpg']
                context_group = in_group['cpg']
                for output_name in cpg_group.keys():
                    state = context_group[output_name]['state'].value
                    states.append(np.expand_dims(state, 2))
                    dist = context_group[output_name]['dist'].value
                    dists.append(np.expand_dims(dist, 2))
                    cpg_states.append(cpg_group[output_name].value)
                # samples x outputs x cpg_wlen
                states = np.swapaxes(np.concatenate(states, axis=2), 1, 2)
                dists = np.swapaxes(np.concatenate(dists, axis=2), 1, 2)
                cpg_states = np.expand_dims(np.vstack(cpg_states).T, 2)
                cpg_dists = np.zeros_like(cpg_states)
                states = np.concatenate([states, cpg_states], axis=2)
                dists = np.concatenate([dists, cpg_dists], axis=2)

                for wlen in opts.win_stats_wlen:
                    idx = (states == dat.CPG_NAN) | (dists > wlen // 2)
                    states_wlen = np.ma.masked_array(states, idx)
                    group = out_group.create_group('win_stats/%d' % wlen)
                    for name, fun in win_stats_meta.items():
                        stat = fun[0](states_wlen)
                        if hasattr(stat, 'mask'):
                            idx = stat.mask
                            stat = stat.data
                            if np.sum(idx):
                                stat[idx] = dat.CPG_NAN
                        group.create_dataset(name, data=stat, dtype=fun[1],
                                             compression='gzip')

            if annos:
                log.info('Adding annotations ...')
                group = in_group.create_group('annos')
                for name, anno in annos.items():
                    group.create_dataset(name, data=anno[chunk_idx],
                                         dtype='int8',
                                         compression='gzip')

            chunk_file.close()

    log.info('Done!')
    return 0
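
# The per-CpG statistics loop above only relies on each *_stats_meta entry
# being a (function, dtype) pair: fun[0] is applied to the masked
# nb_site x nb_output matrix and fun[1] is the storage dtype. A minimal
# sketch of such a table, assuming simple per-site mean/variance (the real
# get_stats_meta may expose different statistic names and implementations):
EXAMPLE_STATS_META = OrderedDict([
    ('mean', (lambda mat: mat.mean(axis=1), np.float32)),
    ('var', (lambda mat: mat.var(axis=1), np.float32)),
])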
def run_dcpg_data(pos_file=None, cpg_profiles=None, dna_files=None,
                  cpg_wlen=None, cpg_cov=1, dna_wlen=1001, anno_files=None,
                  chromos=None, nb_sample=None, nb_sample_chromo=None,
                  chunk_size=32768, seed=0, verbose=False):
    if seed is not None:
        np.random.seed(seed)

    # FIXME
    name = "dcpg_data"
    logging.basicConfig(format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    # Check input arguments
    if not cpg_profiles:
        if not (pos_file or dna_files):
            raise ValueError('Position table and DNA database expected!')

    if dna_wlen and dna_wlen % 2 == 0:
        raise ValueError('dna_wlen must be odd!')
    if cpg_wlen and cpg_wlen % 2 != 0:
        raise ValueError('cpg_wlen must be even!')

    """
    # Parse functions for computing output statistics
    cpg_stats_meta = None
    win_stats_meta = None
    if cpg_stats:
        cpg_stats_meta = get_stats_meta(cpg_stats)
    if win_stats:
        win_stats_meta = get_stats_meta(win_stats)
    """

    outputs = OrderedDict()

    # Read single-cell profiles if provided
    if cpg_profiles:
        log.info('Reading CpG profiles ...')
        outputs['cpg'] = read_cpg_profiles(
            cpg_profiles,
            chromos=chromos,
            nb_sample=nb_sample,
            nb_sample_chromo=nb_sample_chromo,
            log=log.info)

    # Create table with unique positions
    if pos_file:
        # Read positions from file
        log.info('Reading position table ...')
        pos_table = pd.read_table(pos_file, usecols=[0, 1],
                                  dtype={0: str, 1: np.int32},
                                  header=None, comment='#')
        pos_table.columns = ['chromo', 'pos']
        pos_table['chromo'] = dat.format_chromo(pos_table['chromo'])
        pos_table = prepro_pos_table(pos_table)
    else:
        # Extract positions from profiles
        pos_tables = []
        for cpg_table in list(outputs['cpg'].values()):
            pos_tables.append(cpg_table[['chromo', 'pos']])
        pos_table = prepro_pos_table(pos_tables)

    if chromos:
        pos_table = pos_table.loc[pos_table.chromo.isin(chromos)]
    if nb_sample_chromo:
        pos_table = dat.sample_from_chromo(pos_table, nb_sample_chromo)
    if nb_sample:
        pos_table = pos_table.iloc[:nb_sample]

    log.info('%d samples' % len(pos_table))

    # Iterate over chromosomes
    # ------------------------
    for chromo in pos_table.chromo.unique():
        log.info('-' * 80)
        log.info('Chromosome %s ...' % (chromo))
        idx = pos_table.chromo == chromo
        chromo_pos = pos_table.loc[idx].pos.values

        chromo_outputs = OrderedDict()
        if 'cpg' in outputs:
            # Concatenate CpG tables into single nb_site x nb_output matrix
            chromo_outputs['cpg'] = map_cpg_tables(outputs['cpg'],
                                                   chromo, chromo_pos)
            chromo_outputs['cpg_mat'] = np.vstack(
                list(chromo_outputs['cpg'].values())).T
            assert len(chromo_outputs['cpg_mat']) == len(chromo_pos)

        if 'cpg_mat' in chromo_outputs and cpg_cov:
            cov = np.sum(chromo_outputs['cpg_mat'] != dat.CPG_NAN, axis=1)
            assert np.all(cov >= 1)
            idx = cov >= cpg_cov
            tmp = '%s sites matched minimum coverage filter'
            tmp %= format_out_of(idx.sum(), len(idx))
            log.info(tmp)
            if idx.sum() == 0:
                continue
            chromo_pos = chromo_pos[idx]
            chromo_outputs = select_dict(chromo_outputs, idx)

        # Read DNA of chromosome
        chromo_dna = None
        if dna_files:
            chromo_dna = fasta.read_chromo(dna_files, chromo)

        annos = None
        if anno_files:
            log.info('Annotating CpG sites ...')
            annos = dict()
            for anno_file in anno_files:
                name = split_ext(anno_file)
                annos[name] = annotate(anno_file, chromo, chromo_pos)

        # Iterate over chunks
        # -------------------
        nb_chunk = int(np.ceil(len(chromo_pos) / chunk_size))
        for chunk in range(nb_chunk):
            log.info('Chunk \t%d / %d' % (chunk + 1, nb_chunk))
            chunk_start = chunk * chunk_size
            chunk_end = min(len(chromo_pos), chunk_start + chunk_size)
            chunk_idx = slice(chunk_start, chunk_end)
            chunk_pos = chromo_pos[chunk_idx]

            chunk_outputs = select_dict(chromo_outputs, chunk_idx)

            # Instead of writing an HDF5 chunk file
            # ('c%s_%06d-%06d.h5' with 'chromo' and 'pos' datasets, as in
            # main() above), collect everything in a nested dict and yield it
            # at the end of the chunk.
            yield_dict = {}
            yield_dict['chromo'] = np.array([chromo.encode()] * len(chunk_pos),
                                            dtype='S2')
            yield_dict['pos'] = np.array(chunk_pos, dtype=np.int32)

            if len(chunk_outputs):
                yield_dict['outputs'] = {}
                out_group = yield_dict['outputs']

                # Write cpg profiles
                if 'cpg' in chunk_outputs:
                    yield_dict['outputs']['cpg'] = {}
                    for name, value in six.iteritems(chunk_outputs['cpg']):
                        assert len(value) == len(chunk_pos)
                        # Round continuous values
                        out_group['cpg'][name] = np.array(value.round(),
                                                          np.int8)
                    # The per-CpG statistics block (cpg_stats_meta) is
                    # disabled in this generator variant; see the
                    # HDF5-writing main() above for the full code.

            # Write input features
            yield_dict['inputs'] = {}
            in_group = yield_dict['inputs']

            # DNA windows
            if chromo_dna:
                log.info('Extracting DNA sequence windows ...')
                dna_wins = extract_seq_windows(chromo_dna, pos=chunk_pos,
                                               wlen=dna_wlen)
                assert len(dna_wins) == len(chunk_pos)
                in_group['dna'] = np.array(dna_wins, dtype=np.int8)

            # CpG neighbors
            if cpg_wlen:
                log.info('Extracting CpG neighbors ...')
                cpg_ext = fext.KnnCpgFeatureExtractor(cpg_wlen // 2)
                in_group['cpg'] = {}
                context_group = in_group['cpg']
                # outputs['cpg'], since neighboring CpG sites might lie
                # outside chunk borders and un-mapped values are needed
                for name, cpg_table in six.iteritems(outputs['cpg']):
                    cpg_table = cpg_table.loc[cpg_table.chromo == chromo]
                    state, dist = cpg_ext.extract(chunk_pos,
                                                  cpg_table.pos.values,
                                                  cpg_table.value.values)
                    nan = np.isnan(state)
                    state[nan] = dat.CPG_NAN
                    dist[nan] = dat.CPG_NAN
                    # States can be binary (np.int8) or continuous
                    # (np.float32).
                    state = state.astype(cpg_table.value.dtype, copy=False)
                    dist = dist.astype(np.float32, copy=False)
                    assert len(state) == len(chunk_pos)
                    assert len(dist) == len(chunk_pos)
                    assert np.all((dist > 0) | (dist == dat.CPG_NAN))
                    context_group[name] = {'state': state, 'dist': dist}

            # The window-based statistics (win_stats_meta) and annotation
            # blocks are disabled in this generator variant as well; see the
            # HDF5-writing main() above for the full code.

            flat_dict = {}
            flatten_dict(yield_dict, flat_dict, no_prefix=True)
            yield flat_dict

    log.info('Done preprocessing!')
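
# A minimal consumption sketch for the generator above. It assumes the
# flattened chunk dict exposes numpy arrays under the keys produced by
# flatten_dict(..., no_prefix=True) (e.g. 'chromo', 'pos', 'dna' and the
# per-sample CpG arrays); the profile/FASTA arguments and cpg_wlen value are
# placeholders chosen for illustration.
def preview_chunks(cpg_profiles, dna_files, nb_chunks=2, **kwargs):
    gen = run_dcpg_data(cpg_profiles=cpg_profiles, dna_files=dna_files,
                        cpg_wlen=50, **kwargs)
    for i, chunk in enumerate(gen):
        if i >= nb_chunks:
            break
        for key, value in chunk.items():
            # Arrays report their shape; anything else just reports its type
            print(key, getattr(value, 'shape', type(value)))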