def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    if opts.seed is not None:
        np.random.seed(opts.seed)

    log.info('Reading data')
    in_file = h5.File(opts.in_file, 'r')
    nb_sample = in_file['/act'].shape[0]
    if opts.nb_sample:
        nb_sample = min(opts.nb_sample, nb_sample)
    nb_filter = in_file['/act'].shape[-1]
    filters_idx = opts.filters
    if filters_idx is None:
        filters_idx = range(nb_filter)
    else:
        filters_idx = ranges_to_list(filters_idx, 0, nb_filter - 1)
    nb_filter = len(filters_idx)

    # Get only view on data to reduce memory usage. Possible since filters
    # can be processed independently.
    filters_act = in_file['/act']
    seqs = in_file['/inputs/dna'][:nb_sample]

    if seqs.shape[1] != filters_act.shape[1]:
        # Trim sequence length to length of activation layer
        tmp = (seqs.shape[1] - filters_act.shape[1]) // 2
        seqs = seqs[:, tmp:(tmp + filters_act.shape[1])]
        assert seqs.shape[1] == filters_act.shape[1]

    filters_weights = in_file['weights/weights']
    if filters_weights.ndim == 4:
        # For backward compatibility, support filter weights of shape
        # [filter_len, 1, nb_input_features, nb_output_features]
        assert filters_weights.shape[1] == 1
        filters_weights = filters_weights[:, 0, :]
    # The number of input features must match the number of nucleotides.
    assert filters_weights.shape[1] == 4
    filter_len = len(filters_weights)

    print('Filters: %d' % nb_filter)
    print('Filter len: %d' % filter_len)
    print('Samples: %d' % nb_sample)

    # Create output directories
    make_dir(opts.out_dir)
    sub_dirs = dict()
    names = ['logos', 'fa']
    if opts.plot_dens:
        names.append('dens')
    if opts.plot_heat:
        names.append('heat')
    if opts.motif_dbs:
        names.append('tomtom')
    for name in names:
        dirname = pt.join(opts.out_dir, name)
        sub_dirs[name] = dirname
        make_dir(dirname)

    meme_filename = pt.join(opts.out_dir, 'meme.txt')
    meme_file = open_meme(meme_filename, seqs)

    if opts.plot_pca:
        tmp = min(len(filters_act), opts.nb_sample_pca)
        log.info('Performing PCA on activations using %d samples' % tmp)
        # Down-sample activations to at most nb_sample_pca samples to reduce
        # memory usage and run-time.
        pca_act = filters_act[:tmp, :, filters_idx]

        act = pca_act.mean(axis=1)
        tmp = self.plot_filename(opts.out_dir, 'pca_mean')
        plot_pca(act, labels=filters_idx, filename=tmp)

        weights = linear_weights(pca_act.shape[1])
        act = np.average(pca_act, 1, weights)
        tmp = self.plot_filename(opts.out_dir, 'pca_wmean')
        plot_pca(act, labels=filters_idx, filename=tmp)

        act = pca_act.max(axis=1)
        tmp = self.plot_filename(opts.out_dir, 'pca_max')
        plot_pca(act, labels=filters_idx, filename=tmp)

    log.info('Analyzing filters')
    log.info('-----------------')
    filter_stats = []
    weblogo_opts = WEBLOGO_OPTS
    if opts.weblogo_opts:
        weblogo_opts = opts.weblogo_opts
    for idx in filters_idx:
        log.info('Filter %d' % idx)
        filter_act = filters_act[:nb_sample, :, idx]
        filter_weights = filters_weights[:, :, idx].T
        assert len(filter_weights) == len(ALPHABET)

        stats = OrderedDict()
        stats['idx'] = idx
        stats['motif'] = get_motif_from_weights(filter_weights)
        stats['act_mean'] = filter_act.mean()
        stats['act_std'] = filter_act.std()
        stats['ic'] = 0
        stats['nb_site'] = 0
        stats = pd.Series(stats)
        filter_stats.append(stats)

        if stats['act_mean'] == 0:
            log.info('Dead filter -> skip')
            continue

        if opts.plot_dens:
            log.info('Plotting filter densities')
            tmp = self.plot_filename(sub_dirs['dens'], '%03d' % idx)
            plot_filter_densities(np.ravel(filter_act), tmp)

        if opts.plot_heat:
            log.info('Plotting filter heatmap')
            tmp = self.plot_filename(sub_dirs['heat'], '%03d' % idx)
            plot_filter_heatmap(filter_weights, tmp)

        log.info('Extracting activating kmers')
        act_kmers = get_act_kmers(filter_act, filter_len, seqs,
                                  thr_per=opts.act_thr_per,
                                  thr_max=opts.act_thr_max)
        stats.nb_site = len(act_kmers)
        if len(act_kmers) < 10:
            log.info('Only %d activating kmers -> skip' % len(act_kmers))
            continue

        log.info('Plotting sequence logo')
        logo_file = pt.join(sub_dirs['fa'], '%03d.fa' % idx)
        write_kmers(act_kmers, logo_file)
        plot_logo(logo_file,
                  self.plot_filename(sub_dirs['logos'], '%03d' % idx),
                  options=weblogo_opts)
        if opts.delete_fasta:
            os.remove(logo_file)

        log.info('Computing PWM')
        pwm = get_pwm(act_kmers)
        stats.ic = info_content(pwm)
        add_to_meme(meme_file, idx, pwm, len(act_kmers),
                    trim_thr=opts.trim_thr)

    meme_file.close()

    filter_stats = pd.DataFrame(filter_stats)
    for name in ['idx', 'nb_site']:
        filter_stats[name] = filter_stats[name].astype(np.int32)
    filter_stats.sort_values('act_mean', ascending=False, inplace=True)
    print()
    print('\nFilter statistics:')
    print(filter_stats.to_string())
    filter_stats.to_csv(pt.join(opts.out_dir, 'stats.tsv'),
                        float_format='%.4f', sep='\t', index=False)

    if opts.motif_dbs:
        log.info('Running tomtom')
        cmd = 'tomtom -dist pearson -thresh {thr} -oc {out_dir} ' + \
            '{meme_file} {motif_dbs}'
        cmd = cmd.format(thr=opts.fdr,
                         out_dir=pt.join(opts.out_dir, 'tomtom'),
                         meme_file=meme_filename,
                         motif_dbs=' '.join(opts.motif_dbs))
        print('\n', cmd)
        subprocess.call(cmd, shell=True)

        meme_motifs = []
        for motif_db in opts.motif_dbs:
            meme_motifs.append(read_meme_db(motif_db))
        meme_motifs = pd.concat(meme_motifs)
        tmp = pt.join(opts.out_dir, 'tomtom', 'meme_motifs.tsv')
        meme_motifs.to_csv(tmp, sep='\t', index=False)

        report = get_report(pt.join(opts.out_dir, 'stats.tsv'),
                            pt.join(opts.out_dir, 'tomtom', 'tomtom.txt'),
                            meme_motifs)
        report.sort_values(['idx', 'q-value', 'act_mean'],
                           ascending=[True, True, False],
                           inplace=True)
        report.to_csv(pt.join(opts.out_dir, 'report.tsv'),
                      index=False, sep='\t', float_format='%.3f')

        report_top = report.groupby('idx').first().reset_index()
        report_top.sort_values(['q-value', 'act_mean'],
                               ascending=[True, False],
                               inplace=True)
        report_top.index = range(len(report_top))
        report_top.to_csv(pt.join(opts.out_dir, 'report_top.tsv'),
                          index=False, sep='\t', float_format='%.3f')

        print('\nTomtom results:')
        print(report_top.to_string())

    in_file.close()
    log.info('Done!')
    return 0
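# --- Hedged example (not part of the original source) ---
# `linear_weights` is called in main() above for the weighted-mean
# ('wmean') aggregation but is not defined in this file. The sketch
# below is an assumption about its behavior: weights that rise
# linearly towards the window center and sum to one. The name
# `linear_weights_sketch` is hypothetical.
import numpy as np


def linear_weights_sketch(length, start=0.1):
    """Return `length` weights increasing linearly towards the center."""
    half = np.linspace(start, 1, int(np.ceil(length / 2)))
    right = half[:-1] if length % 2 else half
    weights = np.hstack((half, right[::-1]))
    return weights / weights.sum()


# Usage: np.average(act, axis=1, weights=linear_weights_sketch(act.shape[1]))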
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    if opts.seed is not None:
        np.random.seed(opts.seed)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    K.set_learning_phase(0)
    model = mod.load_model(opts.model_files, log=log.info)

    weight_layer, act_layer = mod.get_first_conv_layer(model.layers, True)
    log.info('Using activation layer "%s"' % act_layer.name)
    log.info('Using weight layer "%s"' % weight_layer.name)

    try:
        dna_idx = model.input_names.index('dna')
    except BaseException:
        raise IOError('Model is not a valid DNA model!')

    fun_outputs = to_list(act_layer.output)
    if opts.store_preds:
        fun_outputs += to_list(model.output)
    fun = K.function([to_list(model.input)[dna_idx]], fun_outputs)

    log.info('Reading data ...')
    if opts.store_outputs or opts.store_preds:
        output_names = model.output_names
    else:
        output_names = None
    data_reader = mod.DataReader(
        output_names=output_names,
        use_dna=True,
        dna_wlen=to_list(model.input_shape)[dna_idx][1])
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=opts.shuffle)

    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    out_file = h5.File(opts.out_file, 'w')
    out_group = out_file

    weights = weight_layer.get_weights()
    out_group['weights/weights'] = weights[0]
    out_group['weights/bias'] = weights[1]

    def h5_dump(path, data, idx, dtype=None, compression='gzip'):
        if path not in out_group:
            if dtype is None:
                dtype = data.dtype
            out_group.create_dataset(
                name=path,
                shape=[nb_sample] + list(data.shape[1:]),
                dtype=dtype,
                compression=compression
            )
        out_group[path][idx:idx + len(data)] = data

    log.info('Computing activations')
    progbar = ProgressBar(nb_sample, log.info)
    idx = 0
    for data in data_reader:
        if isinstance(data, tuple):
            inputs, outputs, weights = data
        else:
            inputs = data
        if isinstance(inputs, dict):
            inputs = list(inputs.values())
        batch_size = len(inputs[0])
        progbar.update(batch_size)

        if opts.store_inputs:
            for i, name in enumerate(model.input_names):
                h5_dump('inputs/%s' % name,
                        dna.onehot_to_int(inputs[i]),
                        idx)

        if opts.store_outputs:
            for name, output in six.iteritems(outputs):
                h5_dump('outputs/%s' % name, output, idx)

        fun_eval = fun(inputs)
        act = fun_eval[0]

        if opts.act_wlen:
            delta = opts.act_wlen // 2
            ctr = act.shape[1] // 2
            act = act[:, (ctr - delta):(ctr + delta + 1)]

        if opts.act_fun:
            if opts.act_fun == 'mean':
                act = act.mean(axis=1)
            elif opts.act_fun == 'wmean':
                weights = linear_weights(act.shape[1])
                act = np.average(act, axis=1, weights=weights)
            elif opts.act_fun == 'max':
                act = act.max(axis=1)
            else:
                raise ValueError('Invalid function "%s"!' % (opts.act_fun))

        h5_dump('act', act, idx)

        if opts.store_preds:
            preds = fun_eval[1:]
            for i, name in enumerate(model.output_names):
                h5_dump('preds/%s' % name, preds[i].squeeze(), idx)

        for name, value in six.iteritems(next(meta_reader)):
            h5_dump(name, value, idx)

        idx += batch_size
    progbar.close()

    out_file.close()
    log.info('Done!')
    return 0
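# --- Hedged example (not part of the original source) ---
# A self-contained sketch of the `h5_dump` pattern used in main() above:
# pre-allocate one HDF5 dataset per path on the first batch, then fill
# it slice by slice so the full activation matrix never has to fit in
# memory at once. The file name, dataset path, and array shapes are
# illustrative only.
import h5py
import numpy as np


def demo_h5_dump(filename='demo_act.h5', nb_sample=100, batch_size=10):
    idx = 0
    with h5py.File(filename, 'w') as out_file:
        while idx < nb_sample:
            # Stand-in for one batch of filter activations.
            batch = np.random.rand(batch_size, 4).astype(np.float32)
            if 'act' not in out_file:
                # Allocate the full dataset once, sized for all samples.
                out_file.create_dataset('act',
                                        shape=(nb_sample,) + batch.shape[1:],
                                        dtype=batch.dtype,
                                        compression='gzip')
            out_file['act'][idx:idx + len(batch)] = batch
            idx += len(batch)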
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    if opts.seed is not None:
        np.random.seed(opts.seed)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    K.set_learning_phase(0)
    model = mod.load_model(opts.model_files)

    # Get DNA layer.
    dna_layer = None
    for layer in model.layers:
        if layer.name == 'dna':
            dna_layer = layer
            break
    if not dna_layer:
        raise ValueError('The provided model is not a DNA model!')

    # Create output vector.
    outputs = []
    for output in model.outputs:
        outputs.append(K.reshape(output, (-1, 1)))
    outputs = K.concatenate(outputs, axis=1)

    # Compute gradient of outputs wrt. DNA layer.
    grads = []
    for name in opts.targets:
        if name == 'mean':
            target = K.mean(outputs, axis=1)
        elif name == 'var':
            target = K.var(outputs, axis=1)
        else:
            raise ValueError('Invalid effect size "%s"!' % name)
        grad = K.gradients(target, dna_layer.output)
        grads.extend(grad)
    grad_fun = K.function(model.inputs, grads)

    log.info('Reading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(
        opts.data_files[0],
        regex=opts.replicate_names,
        nb_key=opts.nb_replicate)
    data_reader = mod.data_reader_from_model(
        model, outputs=False, replicate_names=replicate_names)
    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=False)

    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    out_file = h5.File(opts.out_file, 'w')
    out_group = out_file

    def h5_dump(path, data, idx, dtype=None, compression='gzip'):
        if path not in out_group:
            if dtype is None:
                dtype = data.dtype
            out_group.create_dataset(
                name=path,
                shape=[nb_sample] + list(data.shape[1:]),
                dtype=dtype,
                compression=compression
            )
        out_group[path][idx:idx + len(data)] = data

    log.info('Computing effects ...')
    progbar = ProgressBar(nb_sample, log.info)
    idx = 0
    for inputs in data_reader:
        if isinstance(inputs, dict):
            inputs = list(inputs.values())
        batch_size = len(inputs[0])
        progbar.update(batch_size)

        # Compute gradients.
        grads = grad_fun(inputs)

        # Slice window at center.
        if opts.dna_wlen:
            for i, grad in enumerate(grads):
                delta = opts.dna_wlen // 2
                ctr = grad.shape[1] // 2
                grads[i] = grad[:, (ctr - delta):(ctr + delta + 1)]

        # Aggregate effects in window.
        if opts.agg_effects:
            for i, grad in enumerate(grads):
                if opts.agg_effects == 'mean':
                    grad = grad.mean(axis=1)
                elif opts.agg_effects == 'wmean':
                    weights = linear_weights(grad.shape[1])
                    grad = np.average(grad, axis=1, weights=weights)
                elif opts.agg_effects == 'max':
                    grad = grad.max(axis=1)
                else:
                    tmp = 'Invalid function "%s"!' % (opts.agg_effects)
                    raise ValueError(tmp)
                grads[i] = grad

        # Write computed effects.
        for name, grad in zip(opts.targets, grads):
            h5_dump(name, grad, idx)

        # Store inputs.
        if opts.store_inputs:
            for name, value in zip(model.input_names, inputs):
                h5_dump(name, value, idx)

        # Store positions.
        for name, value in next(meta_reader).items():
            h5_dump(name, value, idx)

        idx += batch_size
    progbar.close()

    out_file.close()
    log.info('Done!')
    return 0
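# --- Hedged example (not part of the original source) ---
# A minimal sketch of the K.gradients / K.function pattern that main()
# above uses to differentiate an output statistic with respect to the
# DNA layer, here shown on a toy model. It assumes the Keras 2
# functional API with a TensorFlow backend; the layer sizes and the
# `demo_grad_fun` name are illustrative.
import numpy as np
from keras import backend as K
from keras.layers import Dense, Input
from keras.models import Model


def demo_grad_fun():
    x = Input(shape=(8,), name='dna')
    model = Model(x, Dense(2)(x))
    # Scalar target per sample: the mean over the model outputs.
    target = K.mean(model.output, axis=1)
    # Gradient of the (batch-summed) target w.r.t. the input tensor.
    grad = K.gradients(target, model.input)
    grad_fun = K.function(model.inputs, grad)
    inputs = np.random.rand(4, 8).astype(np.float32)
    return grad_fun([inputs])[0]  # gradient array of shape (4, 8)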