def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    self.opts = opts
    self.log = log

    # Get performance curve functions from names.
    curve_funs = dict()
    if opts.curves:
        for name in opts.curves:
            curve_funs[name] = get_curve_fun(name)
    anno_curve_funs = dict()
    if opts.anno_curves:
        for name in opts.anno_curves:
            anno_curve_funs[name] = get_curve_fun(name)

    log.info('Loading data ...')
    # Read and sort predictions and outputs.
    output_names = dat.get_output_names(opts.data_file,
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    names = {'chromo': None, 'pos': None,
             'outputs': output_names, 'preds': output_names}
    data = hdf.read(opts.data_file, names, nb_sample=opts.nb_sample)
    data['chromo'] = np.array([chromo.decode() for chromo in data['chromo']])
    data = fold_dict(data, nb_level=1)

    # Sort by chromosome and, within each chromosome, by position.
    idx = np.lexsort((data['pos'], data['chromo']))
    data = slice_dict(data, idx)
    # Sanity check: positions must be sorted within each chromosome.
    for chromo in np.unique(data['chromo']):
        chromo_pos = data['pos'][data['chromo'] == chromo]
        tmp = np.sort(chromo_pos)
        assert np.all(chromo_pos == tmp)

    log.info('%d samples' % len(data['pos']))

    reports = []
    curves = []

    log.info('Evaluating globally ...')
    # Evaluate performance globally.
    report = ev.evaluate_outputs(data['outputs'], data['preds'])
    report['anno'] = ANNO_GLOBAL
    reports.append(report)
    pd.set_option('display.width', 1000)
    print(ev.unstack_report(report))

    if curve_funs:
        # Global performance curves.
        for name, fun in curve_funs.items():
            log.info('%s curve' % name)
            curve = ev.evaluate_curve(data['outputs'], data['preds'],
                                      fun=fun, nb_point=opts.nb_curve_point)
            if curve is not None:
                curve['curve'] = name
                curve['anno'] = ANNO_GLOBAL
                curves.append(curve)

    if opts.anno_files:
        log.info('Evaluating annotations ...')
        # Evaluate annotations.
        for anno_file in opts.anno_files:
            anno = read_anno_file(anno_file)
            anno_name = os.path.splitext(os.path.basename(anno_file))[0]
            idx = annotate(data['chromo'], data['pos'], anno)
            log.info('%s: %d' % (anno_name, idx.sum()))
            if idx.sum() < opts.anno_min_sites:
                log.info('Skipping due to insufficient annotated sites!')
                continue
            # Select data at annotated sites.
            anno_data = slice_dict(data, idx)
            report = ev.evaluate_outputs(anno_data['outputs'],
                                         anno_data['preds'])
            report['anno'] = anno_name
            reports.append(report)
            if anno_curve_funs:
                # Performance curves restricted to the annotated sites.
                for name, fun in anno_curve_funs.items():
                    log.info('%s curve' % name)
                    curve = ev.evaluate_curve(anno_data['outputs'],
                                              anno_data['preds'],
                                              fun=fun,
                                              nb_point=opts.nb_curve_point)
                    if curve is not None:
                        curve['curve'] = name
                        curve['anno'] = anno_name
                        curves.append(curve)

    make_dir(opts.out_dir)
    if reports:
        report = pd.concat(reports)
        report = report[['anno', 'metric', 'output', 'value']]
        self.save_report(report, 'metrics')
    if curves:
        curves = pd.concat(curves)
        curves = curves[['anno', 'curve', 'output', 'x', 'y', 'thr']]
        self.save_report(curves, 'curves')

    log.info('Done!')
    return 0
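# Illustrative aside (not part of the script above): np.lexsort sorts by its
# *last* key first, so passing (pos, chromo) yields chromosome-major,
# position-minor order -- exactly the ordering the per-chromosome assertion
# above verifies. Minimal, self-contained demo with made-up coordinates:
import numpy as np

chromo = np.array(['2', '1', '1'])
pos = np.array([5, 9, 3])
idx = np.lexsort((pos, chromo))
assert list(zip(chromo[idx], pos[idx])) == [('1', 3), ('1', 9), ('2', 5)]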
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    model = mod.load_model(opts.model_files)

    log.info('Loading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(opts.data_files[0],
                                              regex=opts.replicate_names,
                                              nb_key=opts.nb_replicate)
    data_reader = mod.data_reader_from_model(
        model, replicate_names=replicate_names)

    # Seed RNGs, since unobserved input CpG states are randomly sampled.
    if opts.seed is not None:
        np.random.seed(opts.seed)
        random.seed(opts.seed)

    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False, shuffle=False)

    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False, shuffle=False)

    writer = None
    if opts.out_data:
        writer = H5Writer(opts.out_data, nb_sample)

    log.info('Predicting ...')
    nb_tot = 0
    nb_eval = 0
    data_eval = dict()
    perf_eval = []
    progbar = ProgressBar(nb_sample, log.info)
    for inputs, outputs, weights in data_reader:
        batch_size = len(list(inputs.values())[0])
        nb_tot += batch_size
        progbar.update(batch_size)

        preds = to_list(model.predict(inputs))

        data_batch = dict()
        data_batch['preds'] = dict()
        data_batch['outputs'] = dict()
        for i, name in enumerate(model.output_names):
            data_batch['preds'][name] = preds[i].squeeze()
            data_batch['outputs'][name] = outputs[name].squeeze()

        for name, value in six.iteritems(next(meta_reader)):
            data_batch[name] = value

        if writer:
            writer.write_dict(data_batch)

        nb_eval += batch_size
        dat.add_to_dict(data_batch, data_eval)

        # Evaluate once all samples, or at least eval_size samples, were seen.
        if nb_tot >= nb_sample or \
                (opts.eval_size and nb_eval >= opts.eval_size):
            data_eval = dat.stack_dict(data_eval)
            perf_eval.append(ev.evaluate_outputs(data_eval['outputs'],
                                                 data_eval['preds']))
            data_eval = dict()
            nb_eval = 0
    progbar.close()

    if writer:
        writer.close()

    # Average metrics over evaluation chunks.
    report = pd.concat(perf_eval)
    report = report.groupby(['metric', 'output']).mean().reset_index()

    if opts.out_report:
        report.to_csv(opts.out_report, sep='\t', index=False)

    report = ev.unstack_report(report)
    print(report.to_string())

    log.info('Done!')
    return 0
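# Illustrative aside with made-up numbers and a hypothetical output name
# ('cpg/cell_a'), not part of the script above: each evaluation chunk yields
# one tidy metric frame; concatenating the chunks and grouping by
# (metric, output) averages every metric across chunks, which is what the
# groupby(...).mean() step above computes.
import pandas as pd

chunk1 = pd.DataFrame({'metric': ['acc', 'auc'],
                       'output': ['cpg/cell_a'] * 2,
                       'value': [0.80, 0.90]})
chunk2 = pd.DataFrame({'metric': ['acc', 'auc'],
                       'output': ['cpg/cell_a'] * 2,
                       'value': [0.84, 0.94]})
report = pd.concat([chunk1, chunk2])
report = report.groupby(['metric', 'output']).mean().reset_index()
print(report)  # acc -> 0.82, auc -> 0.92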
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    if not opts.model_files:
        raise ValueError('No model files provided!')

    log.info('Loading model ...')
    model = mod.load_model(opts.model_files)

    log.info('Loading data ...')
    nb_sample = dat.get_nb_sample(opts.data_files, opts.nb_sample)
    replicate_names = dat.get_replicate_names(opts.data_files[0],
                                              regex=opts.replicate_names,
                                              nb_key=opts.nb_replicate)
    data_reader = mod.data_reader_from_model(
        model, replicate_names=replicate_names)

    data_reader = data_reader(opts.data_files,
                              nb_sample=nb_sample,
                              batch_size=opts.batch_size,
                              loop=False, shuffle=False)

    meta_reader = hdf.reader(opts.data_files, ['chromo', 'pos'],
                             nb_sample=nb_sample,
                             batch_size=opts.batch_size,
                             loop=False, shuffle=False)

    log.info('Predicting ...')
    data = dict()
    progbar = ProgressBar(nb_sample, log.info)
    for inputs, outputs, weights in data_reader:
        batch_size = len(list(inputs.values())[0])
        progbar.update(batch_size)

        preds = to_list(model.predict(inputs))

        data_batch = dict()
        data_batch['preds'] = dict()
        data_batch['outputs'] = dict()
        for i, name in enumerate(model.output_names):
            data_batch['preds'][name] = preds[i].squeeze()
            data_batch['outputs'][name] = outputs[name].squeeze()

        for name, value in next(meta_reader).items():
            data_batch[name] = value

        dat.add_to_dict(data_batch, data)
    progbar.close()

    # Evaluate all predictions at once after stacking the batches.
    data = dat.stack_dict(data)
    report = ev.evaluate_outputs(data['outputs'], data['preds'])

    if opts.out_report:
        report.to_csv(opts.out_report, sep='\t', index=False)

    report = ev.unstack_report(report)
    print(report.to_string())

    if opts.out_data:
        hdf.write_data(data, opts.out_data)

    log.info('Done!')
    return 0
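# Hypothetical sketch of the two batch-accumulation helpers used above,
# assuming dat.add_to_dict appends each batch array under its (possibly
# nested) key and dat.stack_dict concatenates the accumulated arrays along
# the sample axis. This is a guess at their contract for illustration, not
# the actual deepcpg implementation.
import numpy as np

def add_to_dict_sketch(src, dst):
    # Append every leaf array in src to a list under the same key in dst.
    for key, value in src.items():
        if isinstance(value, dict):
            add_to_dict_sketch(value, dst.setdefault(key, dict()))
        else:
            dst.setdefault(key, []).append(value)

def stack_dict_sketch(data):
    # Concatenate each list of per-batch arrays into one array per key.
    return {key: stack_dict_sketch(value) if isinstance(value, dict)
            else np.concatenate(value)
            for key, value in data.items()}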