def build_cpg_model(self):
    """Build the CpG stem model, reusing an existing model when given.

    Replicate names and the CpG window length are read from the first
    training file.  If ``opts.cpg_model[0]`` is a path to an existing
    model file, that model is loaded, its output layers removed and its
    layers prefixed with ``'cpg'``.  When the loaded model was trained
    with a different number of replicates than the data provides, a new
    model of the same architecture is built and the weights are copied
    over.  Otherwise ``opts.cpg_model[0]`` is interpreted as an
    architecture name and a fresh model is built.

    Returns:
        The CpG Keras model without output layers.

    Raises:
        ValueError: If no replicates are found in the training data.
    """
    opts = self.opts
    log = self.log

    replicate_names = dat.get_replicate_names(
        opts.train_files[0],
        regex=opts.replicate_names,
        nb_key=opts.nb_replicate)
    if not replicate_names:
        raise ValueError('No replicates found!')
    print('Replicate names:')
    print(', '.join(replicate_names))
    print()

    cpg_wlen = dat.get_cpg_wlen(opts.train_files[0], opts.cpg_wlen)

    if os.path.exists(opts.cpg_model[0]):
        log.info('Loading existing CpG model ...')
        src_cpg_model = mod.load_model(opts.cpg_model, log=log.info)
        remove_outputs(src_cpg_model)
        rename_layers(src_cpg_model, 'cpg')
        # First input of a CpG model is (replicates, wlen): axis 1 holds
        # the replicate count the source model was trained with.
        nb_replicate = src_cpg_model.input_shape[0][1]
        if nb_replicate != len(replicate_names):
            # Fix: the formatted message was previously built but never
            # emitted, and the implicit string concatenation lacked a
            # separating space ("%dreplicates").
            tmp = 'CpG model was trained with %d replicates but %d ' \
                  'replicates provided. Copying weight to new model ...'
            tmp %= (nb_replicate, len(replicate_names))
            log.info(tmp)
            cpg_model_builder = mod.cpg.get(src_cpg_model.name)(
                l1_decay=opts.l1_decay,
                l2_decay=opts.l2_decay,
                dropout=opts.dropout)
            cpg_inputs = cpg_model_builder.inputs(cpg_wlen, replicate_names)
            cpg_model = cpg_model_builder(cpg_inputs)
            mod.copy_weights(src_cpg_model, cpg_model)
        else:
            cpg_model = src_cpg_model
    else:
        log.info('Building CpG model ...')
        cpg_model_builder = mod.cpg.get(opts.cpg_model[0])(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        cpg_inputs = cpg_model_builder.inputs(cpg_wlen, replicate_names)
        cpg_model = cpg_model_builder(cpg_inputs)

    return cpg_model
def build_cpg_model(self):
    """Build the CpG stem model, reusing an existing model when given.

    Replicate names and the CpG window length are read from the first
    training file.  If ``opts.cpg_model[0]`` is a path to an existing
    model file, that model is loaded, its output layers removed and its
    layers prefixed with ``'cpg'``.  When the loaded model was trained
    with a different number of replicates than the data provides, a new
    model of the same architecture is built and the weights are copied
    over.  Otherwise ``opts.cpg_model[0]`` is interpreted as an
    architecture name and a fresh model is built.

    Returns:
        The CpG Keras model without output layers.

    Raises:
        ValueError: If no replicates are found in the training data.
    """
    opts = self.opts
    log = self.log

    replicate_names = dat.get_replicate_names(
        opts.train_files[0],
        regex=opts.replicate_names,
        nb_key=opts.nb_replicate)
    if not replicate_names:
        raise ValueError('No replicates found!')
    print('Replicate names:')
    print(', '.join(replicate_names))
    print()

    cpg_wlen = dat.get_cpg_wlen(opts.train_files[0], opts.cpg_wlen)

    if os.path.exists(opts.cpg_model[0]):
        log.info('Loading existing CpG model ...')
        src_cpg_model = mod.load_model(opts.cpg_model, log=log.info)
        remove_outputs(src_cpg_model)
        rename_layers(src_cpg_model, 'cpg')
        # First input of a CpG model is (replicates, wlen): axis 1 holds
        # the replicate count the source model was trained with.
        nb_replicate = src_cpg_model.input_shape[0][1]
        if nb_replicate != len(replicate_names):
            # Fix: the formatted message was previously built but never
            # emitted, and the implicit string concatenation lacked a
            # separating space ("%dreplicates").
            tmp = 'CpG model was trained with %d replicates but %d ' \
                  'replicates provided. Copying weight to new model ...'
            tmp %= (nb_replicate, len(replicate_names))
            log.info(tmp)
            cpg_model_builder = mod.cpg.get(src_cpg_model.name)(
                l1_decay=opts.l1_decay,
                l2_decay=opts.l2_decay,
                dropout=opts.dropout)
            cpg_inputs = cpg_model_builder.inputs(cpg_wlen, replicate_names)
            cpg_model = cpg_model_builder(cpg_inputs)
            mod.copy_weights(src_cpg_model, cpg_model)
        else:
            cpg_model = src_cpg_model
    else:
        log.info('Building CpG model ...')
        cpg_model_builder = mod.cpg.get(opts.cpg_model[0])(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        cpg_inputs = cpg_model_builder.inputs(cpg_wlen, replicate_names)
        cpg_model = cpg_model_builder(cpg_inputs)

    return cpg_model
def build_dna_model(self):
    """Return the DNA stem model, loaded from disk or freshly built.

    ``opts.dna_model[0]`` is treated as a file path when it exists on
    disk; otherwise it is taken to name a model architecture that is
    instantiated from scratch with the configured regularization.
    """
    opts = self.opts
    log = self.log
    model_spec = opts.dna_model[0]

    if os.path.exists(model_spec):
        # Reuse a previously trained model: strip its output layers and
        # prefix its layers so they can be joined with a CpG stem.
        log.info('Loading existing DNA model ...')
        dna_model = mod.load_model(opts.dna_model, log=log.info)
        remove_outputs(dna_model)
        rename_layers(dna_model, 'dna')
        return dna_model

    log.info('Building DNA model ...')
    builder = mod.dna.get(model_spec)(
        l1_decay=opts.l1_decay,
        l2_decay=opts.l2_decay,
        dropout=opts.dropout)
    dna_wlen = dat.get_dna_wlen(opts.train_files[0], opts.dna_wlen)
    return builder(builder.inputs(dna_wlen))
def build_dna_model(self):
    """Create the DNA stem model.

    If ``opts.dna_model[0]`` points at an existing file the model is
    loaded from disk, stripped of output layers, and its layers are
    renamed with a 'dna' prefix; otherwise the value names an
    architecture that is built anew.
    """
    opts = self.opts
    log = self.log

    if not os.path.exists(opts.dna_model[0]):
        # Fresh build: window length comes from the first training file.
        log.info('Building DNA model ...')
        dna_model_builder = mod.dna.get(opts.dna_model[0])(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        dna_wlen = dat.get_dna_wlen(opts.train_files[0], opts.dna_wlen)
        dna_inputs = dna_model_builder.inputs(dna_wlen)
        return dna_model_builder(dna_inputs)

    log.info('Loading existing DNA model ...')
    dna_model = mod.load_model(opts.dna_model, log=log.info)
    remove_outputs(dna_model)
    rename_layers(dna_model, 'dna')
    return dna_model
def build_dna_model(self):
    """Build or load the DNA stem model.

    ``opts.dna_model[0]`` is either a path to an existing trained model
    or an architecture name (e.g. ``CnnL2h128``, ``CnnL2h256``) that
    ``mod.dna.get`` resolves to a builder class.
    """
    opts = self.opts
    log = self.log
    spec = opts.dna_model[0]

    exists_on_disk = os.path.exists(spec)
    if exists_on_disk:
        # Pre-trained model: load, drop outputs, prefix layers with 'dna'.
        log.info('Loading existing DNA model ...')
        dna_model = mod.load_model(opts.dna_model, log=log.info)
        remove_outputs(dna_model)
        rename_layers(dna_model, 'dna')
    else:
        # No pre-trained model: instantiate the named architecture.
        # NOTE(review): the original author notes l1_decay/l2_decay
        # default to 0.0001 and dropout to 0.0 — confirm against the
        # CLI option definitions.
        log.info('Building DNA model ...')
        architecture = mod.dna.get(spec)
        dna_model_builder = architecture(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        dna_wlen = dat.get_dna_wlen(opts.train_files[0], opts.dna_wlen)
        dna_inputs = dna_model_builder.inputs(dna_wlen)
        dna_model = dna_model_builder(dna_inputs)

    return dna_model
def build_model(self):
    """Assemble the complete model: DNA and/or CpG stems plus outputs.

    Builds whichever stems are requested (``opts.dna_model`` /
    ``opts.cpg_model``), joins them when both are present, or falls back
    to loading an existing model from ``opts.model_files``.  Output
    layers for the requested output names are then attached.

    Returns:
        A Keras ``Model`` with one output layer per output name; when an
        existing model already has exactly the requested outputs, it is
        returned unchanged.

    Raises:
        ValueError: If no output names are found in the training data.
    """
    opts = self.opts
    log = self.log

    output_names = dat.get_output_names(opts.train_files[0],
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    if not output_names:
        raise ValueError('No outputs found!')

    dna_model = None
    if opts.dna_model:
        dna_model = self.build_dna_model()
    cpg_model = None
    if opts.cpg_model:
        cpg_model = self.build_cpg_model()

    if dna_model is not None and cpg_model is not None:
        log.info('Joining models ...')
        joint_model_builder = mod.joint.get(opts.joint_model)(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        stem = joint_model_builder([dna_model, cpg_model])
        stem.name = '_'.join([stem.name, dna_model.name, cpg_model.name])
    elif dna_model is not None:
        stem = dna_model
    elif cpg_model is not None:
        stem = cpg_model
    else:
        log.info('Loading existing model ...')
        stem = mod.load_model(opts.model_files, log=log.info)
        if sorted(output_names) == sorted(stem.output_names):
            # Existing model already provides exactly these outputs.
            return stem
        log.info('Removing existing output layers ...')
        remove_outputs(stem)

    outputs = mod.add_output_layers(stem.outputs, output_names)
    # Fix: use the Keras 2 keyword names (`inputs`/`outputs`), consistent
    # with the sibling variant of this method; the Keras 1 `input`/`output`
    # keywords were removed from the Keras 2 `Model` constructor.
    model = Model(inputs=stem.inputs, outputs=outputs, name=stem.name)
    return model
def build_model(self):
    """Construct the full model from the configured stems and outputs.

    Depending on options this builds a DNA stem, a CpG stem, a joint
    model of both, or loads an existing model; output layers for the
    requested output names are attached on top of the stem.
    """
    opts = self.opts
    log = self.log

    output_names = dat.get_output_names(opts.train_files[0],
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    if not output_names:
        raise ValueError('No outputs found!')

    dna_model = self.build_dna_model() if opts.dna_model else None
    cpg_model = self.build_cpg_model() if opts.cpg_model else None

    if dna_model is not None and cpg_model is not None:
        # Both stems requested: merge them with the joint architecture.
        log.info('Joining models ...')
        joint_builder = mod.joint.get(opts.joint_model)(
            l1_decay=opts.l1_decay,
            l2_decay=opts.l2_decay,
            dropout=opts.dropout)
        stem = joint_builder([dna_model, cpg_model])
        stem.name = '_'.join([stem.name, dna_model.name, cpg_model.name])
    elif dna_model is not None:
        stem = dna_model
    elif cpg_model is not None:
        stem = cpg_model
    else:
        # Neither stem: continue from a previously saved model.
        log.info('Loading existing model ...')
        stem = mod.load_model(opts.model_files, log=log.info)
        if sorted(output_names) == sorted(stem.output_names):
            # Nothing to do — outputs already match.
            return stem
        log.info('Removing existing output layers ...')
        remove_outputs(stem)

    outputs = mod.add_output_layers(stem.outputs[0], output_names)
    return Model(inputs=stem.inputs, outputs=outputs, name=stem.name)
def main(self, name, opts):
    """Compute first-conv-layer activations of a DNA model on data files.

    Loads the model, builds a backend function returning the activation
    layer's output (and optionally the model predictions), streams the
    data files batch-by-batch, and writes activations, conv-layer
    weights, and optionally inputs/outputs/predictions plus chromosome
    and position metadata to an HDF5 file (``opts.out_file``).

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    if opts.seed is not None:
        np.random.seed(opts.seed)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    # Inference only: disable training-phase behavior (e.g. dropout).
    K.set_learning_phase(0)
    model = mod.load_model(opts.model_files, log=log.info)

    weight_layer, act_layer = mod.get_first_conv_layer(model.layers, True)
    log.info('Using activation layer "%s"' % act_layer.name)
    log.info('Using weight layer "%s"' % weight_layer.name)

    try:
        dna_idx = model.input_names.index('dna')
    except BaseException:
        raise IOError('Model is not a valid DNA model!')

    # Backend function: DNA input -> activations (+ predictions if
    # they are to be stored).
    fun_outputs = to_list(act_layer.output)
    if opts.store_preds:
        fun_outputs += to_list(model.output)
    fun = K.function([to_list(model.input)[dna_idx]], fun_outputs)

    log.info('Reading data ...')
    # Only read outputs from the data when they will be written.
    if opts.store_outputs or opts.store_preds:
        output_names = model.output_names
    else:
        output_names = None
    data_reader = mod.DataReader(
        output_names=output_names,
        use_dna=True,
        dna_wlen=to_list(model.input_shape)[dna_idx][1]
    )
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=opts.shuffle)
    # NOTE(review): meta_reader is not shuffled while data_reader may be
    # (opts.shuffle) — stored chromo/pos only align with activations when
    # shuffling is off; confirm intended usage.
    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    out_file = h5.File(opts.out_file, 'w')
    out_group = out_file
    # Persist the conv layer's parameters alongside the activations.
    weights = weight_layer.get_weights()
    out_group['weights/weights'] = weights[0]
    out_group['weights/bias'] = weights[1]

    def h5_dump(path, data, idx, dtype=None, compression='gzip'):
        # Lazily create the dataset on first write, sized for all
        # samples, then fill the slice for the current batch.
        if path not in out_group:
            if dtype is None:
                dtype = data.dtype
            out_group.create_dataset(
                name=path,
                shape=[nb_sample] + list(data.shape[1:]),
                dtype=dtype,
                compression=compression
            )
        out_group[path][idx:idx+len(data)] = data

    log.info('Computing activations')
    progbar = ProgressBar(nb_sample, log.info)
    idx = 0  # running sample offset into the HDF5 datasets
    for data in data_reader:
        # Reader yields (inputs, outputs, weights) when output_names was
        # given, else inputs only.  NOTE: this `weights` rebinding
        # shadows the conv-layer weights above (already written out).
        if isinstance(data, tuple):
            inputs, outputs, weights = data
        else:
            inputs = data
        if isinstance(inputs, dict):
            inputs = list(inputs.values())
        batch_size = len(inputs[0])
        progbar.update(batch_size)

        if opts.store_inputs:
            # `name` here shadows the function's `name` parameter, which
            # is only used for the logger above.
            for i, name in enumerate(model.input_names):
                h5_dump('inputs/%s' % name,
                        dna.onehot_to_int(inputs[i]), idx)

        if opts.store_outputs:
            for name, output in six.iteritems(outputs):
                h5_dump('outputs/%s' % name, output, idx)

        fun_eval = fun(inputs)
        act = fun_eval[0]

        # Optionally restrict activations to a centered window.
        if opts.act_wlen:
            delta = opts.act_wlen // 2
            ctr = act.shape[1] // 2
            act = act[:, (ctr-delta):(ctr+delta+1)]

        # Optionally aggregate activations along the sequence axis.
        if opts.act_fun:
            if opts.act_fun == 'mean':
                act = act.mean(axis=1)
            elif opts.act_fun == 'wmean':
                # Weighted mean, emphasizing the window center.
                weights = linear_weights(act.shape[1])
                act = np.average(act, axis=1, weights=weights)
            elif opts.act_fun == 'max':
                act = act.max(axis=1)
            else:
                raise ValueError('Invalid function "%s"!' % (opts.act_fun))

        h5_dump('act', act, idx)

        if opts.store_preds:
            # Remaining backend-function outputs are the predictions.
            preds = fun_eval[1:]
            for i, name in enumerate(model.output_names):
                h5_dump('preds/%s' % name, preds[i].squeeze(), idx)

        # Chromosome/position metadata, advanced in lockstep with data.
        for name, value in six.iteritems(next(meta_reader)):
            h5_dump(name, value, idx)

        idx += batch_size
    progbar.close()

    out_file.close()
    log.info('Done!')
    return 0
def main(self, name, opts):
    """Evaluate model predictions against observed outputs in chunks.

    Streams the data files, predicts with the loaded model, optionally
    writes predictions/outputs/positions to an HDF5 file, and computes
    evaluation metrics on chunks of at least ``opts.eval_size`` samples.
    Per-chunk metrics are averaged per (metric, output) and printed;
    optionally written to ``opts.out_report``.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    model = mod.load_model(opts.model_files)

    log.info('Loading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(opts.data_files[0],
                                              regex=opts.replicate_names,
                                              nb_key=opts.nb_replicate)
    # NOTE(review): `replicate_names` is passed both positionally (second
    # argument) and as a keyword — verify data_reader_from_model's
    # signature; the positional slot may be a different parameter.
    data_reader = mod.data_reader_from_model(
        model, replicate_names, replicate_names=replicate_names)

    # Seed used since unobserved input CpG states are randomly sampled
    if opts.seed is not None:
        np.random.seed(opts.seed)
        random.seed(opts.seed)

    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=False)
    # Parallel reader for chromosome/position metadata, same order.
    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    writer = None
    if opts.out_data:
        writer = H5Writer(opts.out_data, nb_sample)

    log.info('Predicting ...')
    nb_tot = 0    # total samples processed
    nb_eval = 0   # samples accumulated in the current evaluation chunk
    data_eval = dict()
    perf_eval = []
    progbar = ProgressBar(nb_sample, log.info)
    for inputs, outputs, weights in data_reader:
        batch_size = len(list(inputs.values())[0])
        nb_tot += batch_size
        progbar.update(batch_size)

        preds = to_list(model.predict(inputs))

        data_batch = dict()
        data_batch['preds'] = dict()
        data_batch['outputs'] = dict()
        for i, name in enumerate(model.output_names):
            # `name` shadows the function's `name` parameter (only used
            # for the logger above).
            data_batch['preds'][name] = preds[i].squeeze()
            data_batch['outputs'][name] = outputs[name].squeeze()

        for name, value in six.iteritems(next(meta_reader)):
            data_batch[name] = value

        if writer:
            writer.write_dict(data_batch)

        nb_eval += batch_size
        dat.add_to_dict(data_batch, data_eval)

        # Evaluate once a chunk is full, or at the end of the data —
        # keeps memory bounded on large data sets.
        if nb_tot >= nb_sample or \
                (opts.eval_size and nb_eval >= opts.eval_size):
            data_eval = dat.stack_dict(data_eval)
            perf_eval.append(
                ev.evaluate_outputs(data_eval['outputs'],
                                    data_eval['preds']))
            data_eval = dict()
            nb_eval = 0
    progbar.close()

    if writer:
        writer.close()

    # Average per-chunk metrics per (metric, output) pair.
    report = pd.concat(perf_eval)
    report = report.groupby(['metric', 'output']).mean().reset_index()

    if opts.out_report:
        report.to_csv(opts.out_report, sep='\t', index=False)

    report = ev.unstack_report(report)
    print(report.to_string())

    log.info('Done!')
    return 0
def main(self, name, opts):
    """Compute gradient-based effects of DNA inputs on model outputs.

    Builds symbolic targets (mean and/or variance across all model
    outputs), differentiates them with respect to the 'dna' input layer,
    evaluates the gradients batch-wise over the data files, optionally
    slices/aggregates them along the sequence window, and writes the
    results plus chromosome/position metadata to ``opts.out_file``.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    if opts.seed is not None:
        np.random.seed(opts.seed)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    # Inference only: disable training-phase behavior (e.g. dropout).
    K.set_learning_phase(0)
    model = mod.load_model(opts.model_files)

    # Get DNA layer.
    dna_layer = None
    for layer in model.layers:
        if layer.name == 'dna':
            dna_layer = layer
            break
    if not dna_layer:
        raise ValueError('The provided model is not a DNA model!')

    # Create output vector: one column per model output.
    outputs = []
    for output in model.outputs:
        outputs.append(K.reshape(output, (-1, 1)))
    outputs = K.concatenate(outputs, axis=1)

    # Compute gradient of outputs wrt. DNA layer, one gradient tensor
    # per requested effect-size target.
    grads = []
    for name in opts.targets:
        # `name` here shadows the function's `name` parameter, which is
        # only used for the logger above.
        if name == 'mean':
            target = K.mean(outputs, axis=1)
        elif name == 'var':
            target = K.var(outputs, axis=1)
        else:
            raise ValueError('Invalid effect size "%s"!' % name)
        grad = K.gradients(target, dna_layer.output)
        grads.extend(grad)
    grad_fun = K.function(model.inputs, grads)

    log.info('Reading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(opts.data_files[0],
                                              regex=opts.replicate_names,
                                              nb_key=opts.nb_replicate)
    # outputs=False: only model inputs are needed for gradients.
    data_reader = mod.data_reader_from_model(
        model, outputs=False, replicate_names=replicate_names)
    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=False)
    # Parallel reader for chromosome/position metadata, same order.
    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    out_file = h5.File(opts.out_file, 'w')
    out_group = out_file

    def h5_dump(path, data, idx, dtype=None, compression='gzip'):
        # Lazily create the dataset on first write, sized for all
        # samples, then fill the slice for the current batch.
        if path not in out_group:
            if dtype is None:
                dtype = data.dtype
            out_group.create_dataset(name=path,
                                     shape=[nb_sample] + list(data.shape[1:]),
                                     dtype=dtype,
                                     compression=compression)
        out_group[path][idx:idx + len(data)] = data

    log.info('Computing effects ...')
    progbar = ProgressBar(nb_sample, log.info)
    idx = 0  # running sample offset into the HDF5 datasets
    for inputs in data_reader:
        if isinstance(inputs, dict):
            inputs = list(inputs.values())
        batch_size = len(inputs[0])
        progbar.update(batch_size)

        # Compute gradients.
        grads = grad_fun(inputs)

        # Slice window at center.
        if opts.dna_wlen:
            for i, grad in enumerate(grads):
                delta = opts.dna_wlen // 2
                ctr = grad.shape[1] // 2
                grads[i] = grad[:, (ctr - delta):(ctr + delta + 1)]

        # Aggregate effects in window
        if opts.agg_effects:
            for i, grad in enumerate(grads):
                if opts.agg_effects == 'mean':
                    grad = grad.mean(axis=1)
                elif opts.agg_effects == 'wmean':
                    # Weighted mean, emphasizing the window center.
                    weights = linear_weights(grad.shape[1])
                    grad = np.average(grad, axis=1, weights=weights)
                elif opts.agg_effects == 'max':
                    grad = grad.max(axis=1)
                else:
                    tmp = 'Invalid function "%s"!' % (opts.agg_effects)
                    raise ValueError(tmp)
                grads[i] = grad

        # Write computed effects
        for name, grad in zip(opts.targets, grads):
            h5_dump(name, grad, idx)

        # Store inputs
        if opts.store_inputs:
            for name, value in zip(model.input_names, inputs):
                h5_dump(name, value, idx)

        # Store positions
        for name, value in next(meta_reader).items():
            h5_dump(name, value, idx)

        idx += batch_size
    progbar.close()

    out_file.close()
    log.info('Done!')
    return 0
def main(self, name, opts):
    """Compute first-conv-layer activations of a DNA model on data files.

    Loads the model, builds a backend function returning the activation
    layer's output (and optionally the model predictions), streams the
    data files batch-by-batch in file order, and writes activations,
    conv-layer weights, and optionally inputs/outputs/predictions plus
    chromosome and position metadata to an HDF5 file (``opts.out_file``).

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    if opts.seed is not None:
        np.random.seed(opts.seed)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    # Inference only: disable training-phase behavior (e.g. dropout).
    K.set_learning_phase(0)
    model = mod.load_model(opts.model_files, log=log.info)

    weight_layer, act_layer = mod.get_first_conv_layer(model.layers, True)
    log.info('Using activation layer "%s"' % act_layer.name)
    log.info('Using weight layer "%s"' % weight_layer.name)

    try:
        dna_idx = model.input_names.index('dna')
    except BaseException:
        raise IOError('Model is not a valid DNA model!')

    # Backend function: DNA input -> activations (+ predictions if
    # they are to be stored).
    fun_outputs = to_list(act_layer.output)
    if opts.store_preds:
        fun_outputs += to_list(model.output)
    fun = K.function([to_list(model.input)[dna_idx]], fun_outputs)

    log.info('Reading data ...')
    # Only read outputs from the data when they will be written.
    if opts.store_outputs or opts.store_preds:
        output_names = model.output_names
    else:
        output_names = None
    data_reader = mod.DataReader(output_names=output_names,
                                 use_dna=True,
                                 dna_wlen=to_list(
                                     model.input_shape)[dna_idx][1])
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=False)
    # Parallel reader for chromosome/position metadata, same order.
    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    out_file = h5.File(opts.out_file, 'w')
    out_group = out_file
    # Persist the conv layer's parameters alongside the activations.
    weights = weight_layer.get_weights()
    out_group['weights/weights'] = weights[0]
    out_group['weights/bias'] = weights[1]

    def h5_dump(path, data, idx, dtype=None, compression='gzip'):
        # Lazily create the dataset on first write, sized for all
        # samples, then fill the slice for the current batch.
        if path not in out_group:
            if dtype is None:
                dtype = data.dtype
            out_group.create_dataset(name=path,
                                     shape=[nb_sample] + list(data.shape[1:]),
                                     dtype=dtype,
                                     compression=compression)
        out_group[path][idx:idx + len(data)] = data

    log.info('Computing activations')
    progbar = ProgressBar(nb_sample, log.info)
    idx = 0  # running sample offset into the HDF5 datasets
    for data in data_reader:
        # Reader yields (inputs, outputs, weights) when output_names was
        # given, else inputs only.  NOTE: this `weights` rebinding
        # shadows the conv-layer weights above (already written out).
        if isinstance(data, tuple):
            inputs, outputs, weights = data
        else:
            inputs = data
        if isinstance(inputs, dict):
            inputs = list(inputs.values())
        batch_size = len(inputs[0])
        progbar.update(batch_size)

        if opts.store_inputs:
            # `name` here shadows the function's `name` parameter, which
            # is only used for the logger above.
            for i, name in enumerate(model.input_names):
                h5_dump('inputs/%s' % name,
                        dna.onehot_to_int(inputs[i]), idx)

        if opts.store_outputs:
            for name, output in six.iteritems(outputs):
                h5_dump('outputs/%s' % name, output, idx)

        fun_eval = fun(inputs)
        act = fun_eval[0]

        # Optionally restrict activations to a centered window.
        if opts.act_wlen:
            delta = opts.act_wlen // 2
            ctr = act.shape[1] // 2
            act = act[:, (ctr - delta):(ctr + delta + 1)]

        # Optionally aggregate activations along the sequence axis.
        if opts.act_fun:
            if opts.act_fun == 'mean':
                act = act.mean(axis=1)
            elif opts.act_fun == 'wmean':
                # Weighted mean, emphasizing the window center.
                weights = linear_weights(act.shape[1])
                act = np.average(act, axis=1, weights=weights)
            elif opts.act_fun == 'max':
                act = act.max(axis=1)
            else:
                raise ValueError('Invalid function "%s"!' % (opts.act_fun))

        h5_dump('act', act, idx)

        if opts.store_preds:
            # Remaining backend-function outputs are the predictions.
            preds = fun_eval[1:]
            for i, name in enumerate(model.output_names):
                h5_dump('preds/%s' % name, preds[i].squeeze(), idx)

        # Chromosome/position metadata, advanced in lockstep with data.
        for name, value in six.iteritems(next(meta_reader)):
            h5_dump(name, value, idx)

        idx += batch_size
    progbar.close()

    out_file.close()
    log.info('Done!')
    return 0
def main(self, name, opts):
    """Compute gradient-based effects of DNA inputs on model outputs.

    Builds symbolic targets (mean and/or variance across all model
    outputs), differentiates them with respect to the 'dna' input layer,
    evaluates the gradients batch-wise over the data files, optionally
    slices/aggregates them along the sequence window, and writes the
    results plus chromosome/position metadata to ``opts.out_file``.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    if opts.seed is not None:
        np.random.seed(opts.seed)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    # Inference only: disable training-phase behavior (e.g. dropout).
    K.set_learning_phase(0)
    model = mod.load_model(opts.model_files)

    # Get DNA layer.
    dna_layer = None
    for layer in model.layers:
        if layer.name == 'dna':
            dna_layer = layer
            break
    if not dna_layer:
        raise ValueError('The provided model is not a DNA model!')

    # Create output vector: one column per model output.
    outputs = []
    for output in model.outputs:
        outputs.append(K.reshape(output, (-1, 1)))
    outputs = K.concatenate(outputs, axis=1)

    # Compute gradient of outputs wrt. DNA layer, one gradient tensor
    # per requested effect-size target.
    grads = []
    for name in opts.targets:
        # `name` here shadows the function's `name` parameter, which is
        # only used for the logger above.
        if name == 'mean':
            target = K.mean(outputs, axis=1)
        elif name == 'var':
            target = K.var(outputs, axis=1)
        else:
            raise ValueError('Invalid effect size "%s"!' % name)
        grad = K.gradients(target, dna_layer.output)
        grads.extend(grad)
    grad_fun = K.function(model.inputs, grads)

    log.info('Reading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(
        opts.data_files[0],
        regex=opts.replicate_names,
        nb_key=opts.nb_replicate)
    # outputs=False: only model inputs are needed for gradients.
    data_reader = mod.data_reader_from_model(
        model, outputs=False, replicate_names=replicate_names)
    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=False)
    # Parallel reader for chromosome/position metadata, same order.
    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    out_file = h5.File(opts.out_file, 'w')
    out_group = out_file

    def h5_dump(path, data, idx, dtype=None, compression='gzip'):
        # Lazily create the dataset on first write, sized for all
        # samples, then fill the slice for the current batch.
        if path not in out_group:
            if dtype is None:
                dtype = data.dtype
            out_group.create_dataset(
                name=path,
                shape=[nb_sample] + list(data.shape[1:]),
                dtype=dtype,
                compression=compression
            )
        out_group[path][idx:idx+len(data)] = data

    log.info('Computing effects ...')
    progbar = ProgressBar(nb_sample, log.info)
    idx = 0  # running sample offset into the HDF5 datasets
    for inputs in data_reader:
        if isinstance(inputs, dict):
            inputs = list(inputs.values())
        batch_size = len(inputs[0])
        progbar.update(batch_size)

        # Compute gradients.
        grads = grad_fun(inputs)

        # Slice window at center.
        if opts.dna_wlen:
            for i, grad in enumerate(grads):
                delta = opts.dna_wlen // 2
                ctr = grad.shape[1] // 2
                grads[i] = grad[:, (ctr-delta):(ctr+delta+1)]

        # Aggregate effects in window
        if opts.agg_effects:
            for i, grad in enumerate(grads):
                if opts.agg_effects == 'mean':
                    grad = grad.mean(axis=1)
                elif opts.agg_effects == 'wmean':
                    # Weighted mean, emphasizing the window center.
                    weights = linear_weights(grad.shape[1])
                    grad = np.average(grad, axis=1, weights=weights)
                elif opts.agg_effects == 'max':
                    grad = grad.max(axis=1)
                else:
                    tmp = 'Invalid function "%s"!' % (opts.agg_effects)
                    raise ValueError(tmp)
                grads[i] = grad

        # Write computed effects
        for name, grad in zip(opts.targets, grads):
            h5_dump(name, grad, idx)

        # Store inputs
        if opts.store_inputs:
            for name, value in zip(model.input_names, inputs):
                h5_dump(name, value, idx)

        # Store positions
        for name, value in next(meta_reader).items():
            h5_dump(name, value, idx)

        idx += batch_size
    progbar.close()

    out_file.close()
    log.info('Done!')
    return 0
def main(self, name, opts):
    """Predict on the data files and evaluate against observed outputs.

    Accumulates all predictions, observed outputs and chromosome/position
    metadata in memory, evaluates the model's outputs, prints the
    unstacked report, and optionally writes the report (TSV) and the raw
    data (HDF5) to disk.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    log.setLevel(logging.DEBUG if opts.verbose else logging.INFO)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    model = mod.load_model(opts.model_files)

    log.info('Loading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(
        opts.data_files[0],
        regex=opts.replicate_names,
        nb_key=opts.nb_replicate)
    data_reader = mod.data_reader_from_model(
        model, replicate_names, replicate_names=replicate_names)
    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=False)
    # Metadata reader iterated in lockstep with the data reader.
    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    log.info('Predicting ...')
    collected = dict()
    progbar = ProgressBar(nb_sample, log.info)
    for inputs, outputs, weights in data_reader:
        first_input = list(inputs.values())[0]
        progbar.update(len(first_input))

        preds = to_list(model.predict(inputs))

        # Assemble this batch's predictions, targets and metadata.
        batch = {'preds': dict(), 'outputs': dict()}
        for out_idx, out_name in enumerate(model.output_names):
            batch['preds'][out_name] = preds[out_idx].squeeze()
            batch['outputs'][out_name] = outputs[out_name].squeeze()
        for meta_name, meta_value in next(meta_reader).items():
            batch[meta_name] = meta_value

        dat.add_to_dict(batch, collected)
    progbar.close()

    data = dat.stack_dict(collected)
    report = ev.evaluate_outputs(data['outputs'], data['preds'])

    if opts.out_report:
        report.to_csv(opts.out_report, sep='\t', index=False)

    report = ev.unstack_report(report)
    print(report.to_string())

    if opts.out_data:
        hdf.write_data(data, opts.out_data)

    log.info('Done!')
    return 0
def main(self, name, opts):
    """Evaluate model predictions against observed outputs in chunks.

    Streams the data files, predicts with the loaded model, optionally
    writes predictions/outputs/positions to an HDF5 file, and computes
    evaluation metrics on chunks of at least ``opts.eval_size`` samples.
    Per-chunk metrics are averaged per (metric, output) and printed;
    optionally written to ``opts.out_report``.

    Returns 0 on success.
    """
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    model = mod.load_model(opts.model_files)

    log.info('Loading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(
        opts.data_files[0],
        regex=opts.replicate_names,
        nb_key=opts.nb_replicate)
    # NOTE(review): `replicate_names` is passed both positionally (second
    # argument) and as a keyword — verify data_reader_from_model's
    # signature; the positional slot may be a different parameter.
    data_reader = mod.data_reader_from_model(
        model, replicate_names, replicate_names=replicate_names)

    # Seed used since unobserved input CpG states are randomly sampled
    if opts.seed is not None:
        np.random.seed(opts.seed)
        random.seed(opts.seed)

    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False,
                              shuffle=False)
    # Parallel reader for chromosome/position metadata, same order.
    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False,
                             shuffle=False)

    writer = None
    if opts.out_data:
        writer = H5Writer(opts.out_data, nb_sample)

    log.info('Predicting ...')
    nb_tot = 0    # total samples processed
    nb_eval = 0   # samples accumulated in the current evaluation chunk
    data_eval = dict()
    perf_eval = []
    progbar = ProgressBar(nb_sample, log.info)
    for inputs, outputs, weights in data_reader:
        batch_size = len(list(inputs.values())[0])
        nb_tot += batch_size
        progbar.update(batch_size)

        preds = to_list(model.predict(inputs))

        data_batch = dict()
        data_batch['preds'] = dict()
        data_batch['outputs'] = dict()
        for i, name in enumerate(model.output_names):
            # `name` shadows the function's `name` parameter (only used
            # for the logger above).
            data_batch['preds'][name] = preds[i].squeeze()
            data_batch['outputs'][name] = outputs[name].squeeze()

        for name, value in six.iteritems(next(meta_reader)):
            data_batch[name] = value

        if writer:
            writer.write_dict(data_batch)

        nb_eval += batch_size
        dat.add_to_dict(data_batch, data_eval)

        # Evaluate once a chunk is full, or at the end of the data —
        # keeps memory bounded on large data sets.
        if nb_tot >= nb_sample or \
                (opts.eval_size and nb_eval >= opts.eval_size):
            data_eval = dat.stack_dict(data_eval)
            perf_eval.append(ev.evaluate_outputs(data_eval['outputs'],
                                                 data_eval['preds']))
            data_eval = dict()
            nb_eval = 0
    progbar.close()

    if writer:
        writer.close()

    # Average per-chunk metrics per (metric, output) pair.
    report = pd.concat(perf_eval)
    report = report.groupby(['metric', 'output']).mean().reset_index()

    if opts.out_report:
        report.to_csv(opts.out_report, sep='\t', index=False)

    report = ev.unstack_report(report)
    print(report.to_string())

    log.info('Done!')
    return 0