def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    # Select output names from the first data file, optionally filtered by a
    # regular expression.
    output_names = dat.get_output_names(opts.data_files[0],
                                        regex=opts.output_names)

    # Compute statistics for each output.
    stats = OrderedDict()
    for name in output_names:
        output = hdf.read(opts.data_files, 'outputs/%s' % name,
                          nb_sample=opts.nb_sample)
        output = list(output.values())[0]
        stats[name] = get_output_stats(output)

    # Assemble one data frame with a row of statistics per output.
    tmp = []
    for key, value in six.iteritems(stats):
        tmp.append(pd.DataFrame(value, index=[key]))
    stats = pd.concat(tmp)
    stats.index.name = 'output'
    stats.reset_index(inplace=True)

    print(stats.to_string())
    if opts.out_tsv:
        stats.to_csv(opts.out_tsv, sep='\t', index=False)
    if opts.out_fig:
        plot_stats(stats).savefig(opts.out_fig)

    return 0
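# A minimal sketch of what the get_output_stats helper used above could
# compute, assuming binary CpG outputs with -1 marking unobserved sites (the
# convention visible in the test data below). The actual helper is defined
# elsewhere in the package and may differ; names and fields here are
# illustrative only.
import numpy as np
from collections import OrderedDict

def get_output_stats_sketch(output):
    output = np.asarray(output)
    stats = OrderedDict()
    stats['nb_tot'] = len(output)
    observed = output[output != -1]  # keep only observed sites
    stats['nb_obs'] = len(observed)
    stats['frac_obs'] = stats['nb_obs'] / float(max(stats['nb_tot'], 1))
    stats['mean'] = observed.mean() if len(observed) else float('nan')
    stats['var'] = observed.var() if len(observed) else float('nan')
    return stats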
def setup_class(self):
    self.data_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data')
    self.data_files = glob(os.path.join(self.data_path, 'c*.h5'))
    names = ['chromo',
             'pos',
             '/inputs/dna',
             '/inputs/cpg/BS27_4_SER/dist',
             '/inputs/cpg/BS27_4_SER/state',
             '/inputs/cpg/BS28_2_SER/dist',
             '/inputs/cpg/BS28_2_SER/state',
             '/inputs/annos/exons',
             '/inputs/annos/CGI',
             '/outputs/cpg/BS27_4_SER',
             '/outputs/cpg/BS28_2_SER',
             '/outputs/cpg_stats/mean',
             '/outputs/cpg_stats/var',
             '/outputs/cpg_stats/cat_var',
             '/outputs/cpg_stats/cat2_var',
             '/outputs/cpg_stats/diff',
             '/outputs/cpg_stats/mode',
             ]
    self.data = hdf.read(self.data_files, names)
    self.chromo = self.data['chromo']
    self.pos = self.data['pos']
def test_read_from(self):
    data_files = self.data_files[:2]
    names = ['pos', '/outputs/cpg/BS27_4_SER']
    data = hdf.read(data_files, names)
    for nb_sample in [99, 9999, 99999999999]:
        reader = hdf.reader(data_files, names)
        data_read = hdf.read_from(reader, nb_sample)
        for name in names:
            assert np.all(data[name][:nb_sample] == data_read[name])
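# Sketch of the reader/read_from contract the test above relies on (assumed
# semantics, not the package's actual implementation): the reader yields
# dicts of array batches, and read_from drains it until nb_sample records
# have been collected or the reader is exhausted, which is why requesting
# 99999999999 samples simply returns everything available.
def read_from_sketch(reader, nb_sample):
    data = dict()
    nb_seen = 0
    for batch in reader:
        for name, value in batch.items():
            data.setdefault(name, []).append(value)
        # All arrays in a batch have the same length.
        nb_seen += len(list(batch.values())[0])
        if nb_seen >= nb_sample:
            break
    return {name: np.concatenate(value)[:nb_sample]
            for name, value in data.items()}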
def test_read(self):
    data_files = [self.data_files[0], self.data_files[-1]]
    names = ['pos', 'chromo', '/outputs/cpg/BS27_4_SER']
    data = hdf.read(data_files, names, shuffle=False)

    assert np.all(data['chromo'][:5] == b'18')
    npt.assert_equal(data['pos'][:5],
                     [3000023, 3000086, 3000092, 3000103, 3000163])
    npt.assert_equal(data['/outputs/cpg/BS27_4_SER'][:5],
                     [1, 1, 1, -1, 0])

    assert np.all(data['chromo'][-5:] == b'19')
    npt.assert_equal(data['pos'][-5:],
                     [4447803, 4447814, 4447818, 4447821, 4447847])
    npt.assert_equal(data['/outputs/cpg/BS27_4_SER'][-5:],
                     [1, 1, 1, 1, 1])
def test_nb_sample(self):
    """Test nb_sample together with shuffle.

    Should always return the same data if nb_sample equals the size of the
    first data file.
    """
    names = ['pos', '/outputs/cpg/BS27_4_SER']
    h5_file = h5.File(self.data_files[0], 'r')
    nb_sample = len(h5_file[names[0]])
    h5_file.close()

    data_ref = None
    for i in range(2):
        data = hdf.read(self.data_files, names, shuffle=True,
                        nb_sample=nb_sample)
        if data_ref is None:
            data_ref = data
            idx_ref = np.argsort(data_ref['pos'])
            continue
        # Sorting by position makes the two shuffled reads comparable.
        idx_data = np.argsort(data['pos'])
        for name in names:
            assert len(data_ref[name]) == nb_sample
            assert np.all(data_ref[name][idx_ref] == data[name][idx_data])
def test_read_reader(self):
    """Test that read and reader yield the same data."""
    nb_sample = 7777
    nb_loop = 10
    names = ['pos', 'chromo', '/outputs/cpg/BS27_4_SER']
    data = hdf.read(self.data_files, names, nb_sample=nb_sample)
    reader = hdf.reader(self.data_files, names, nb_sample=nb_sample,
                        loop=True)
    for loop in range(nb_loop):
        data_loop = dict()
        nb_sample_loop = 0
        while nb_sample_loop < nb_sample:
            data_batch = next(reader)
            for key, value in six.iteritems(data_batch):
                data_loop.setdefault(key, []).append(value)
            # All arrays in a batch have the same length, so the length of
            # the last value counts the records of this batch.
            nb_sample_loop += len(value)
        assert nb_sample_loop == nb_sample
        for key, value in six.iteritems(data_loop):
            fun = np.vstack if value[0].ndim > 1 else np.hstack
            data_loop[key] = fun(value)
            assert np.all(data[key] == data_loop[key])
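# Design note on the vstack/hstack switch above: 1-D arrays such as pos and
# chromo are joined along axis 0 with np.hstack, while 2-D output matrices
# have their row batches stacked with np.vstack. A single
# np.concatenate(value, axis=0) would cover both cases uniformly.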
def setup_class(self):
    self.data_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data')
    self.data_files = [
        os.path.join(self.data_path, 'c18_000000-005000.h5'),
        os.path.join(self.data_path, 'c18_005000-008712.h5'),
        os.path.join(self.data_path, 'c19_000000-005000.h5'),
        os.path.join(self.data_path, 'c19_005000-008311.h5')
    ]
    names = [
        'chromo',
        'pos',
        '/inputs/dna',
        '/inputs/cpg/BS27_4_SER/dist',
        '/inputs/cpg/BS27_4_SER/state',
        '/inputs/cpg/BS28_2_SER/dist',
        '/inputs/cpg/BS28_2_SER/state',
        '/inputs/annos/exons',
        '/inputs/annos/CGI',
        '/outputs/cpg/BS27_4_SER',
        '/outputs/cpg/BS28_2_SER',
        '/outputs/stats/mean',
        '/outputs/stats/var',
        '/outputs/stats/cat_var',
        '/outputs/stats/cat2_var',
        '/outputs/stats/diff',
        '/outputs/stats/mode',
        '/outputs/bulk/BS9N_2I',
        '/outputs/bulk/BS9N_SER'
    ]
    self.data = hdf.read(self.data_files, names)
    self.chromo = self.data['chromo']
    self.pos = self.data['pos']
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    if opts.seed is not None:
        np.random.seed(opts.seed)
        random.seed(opts.seed)

    self.log = log
    self.opts = opts

    make_dir(opts.out_dir)

    log.info('Building model ...')
    model = self.build_model()
    model.summary()
    self.set_trainability(model)

    if opts.filter_weights:
        conv_layer = mod.get_first_conv_layer(model.layers)
        log.info('Initializing filters of %s ...' % conv_layer.name)
        self.init_filter_weights(opts.filter_weights, conv_layer)
    mod.save_model(model, os.path.join(opts.out_dir, 'model.json'))

    log.info('Computing output statistics ...')
    output_names = []
    for output_layer in model.output_layers:
        output_names.append(output_layer.name)

    output_stats = OrderedDict()
    if opts.no_class_weights:
        class_weights = None
    else:
        class_weights = OrderedDict()
    for name in output_names:
        output = hdf.read(opts.train_files, 'outputs/%s' % name,
                          nb_sample=opts.nb_train_sample)
        output = list(output.values())[0]
        output_stats[name] = get_output_stats(output)
        if class_weights is not None:
            class_weights[name] = get_output_class_weights(name, output)

    self.print_output_stats(output_stats)
    if class_weights:
        self.print_class_weights(class_weights)

    output_weights = None
    if opts.output_weights:
        log.info('Initializing output weights ...')
        output_weights = get_output_weights(output_names,
                                            opts.output_weights)
        print('Output weights:')
        for output_name in output_names:
            if output_name in output_weights:
                print('%s: %.2f' % (output_name,
                                    output_weights[output_name]))
        print()

    self.metrics = dict()
    for output_name in output_names:
        self.metrics[output_name] = get_metrics(output_name)

    optimizer = Adam(lr=opts.learning_rate)
    model.compile(optimizer=optimizer,
                  loss=mod.get_objectives(output_names),
                  loss_weights=output_weights,
                  metrics=self.metrics)

    log.info('Loading data ...')
    replicate_names = dat.get_replicate_names(opts.train_files[0],
                                              regex=opts.replicate_names,
                                              nb_key=opts.nb_replicate)
    data_reader = mod.data_reader_from_model(
        model, replicate_names=replicate_names)

    nb_train_sample = dat.get_nb_sample(opts.train_files,
                                        opts.nb_train_sample)
    train_data = data_reader(opts.train_files,
                             class_weights=class_weights,
                             batch_size=opts.batch_size,
                             nb_sample=nb_train_sample,
                             shuffle=True,
                             loop=True)

    if opts.val_files:
        nb_val_sample = dat.get_nb_sample(opts.val_files,
                                          opts.nb_val_sample)
        val_data = data_reader(opts.val_files,
                               batch_size=opts.batch_size,
                               nb_sample=nb_val_sample,
                               shuffle=False,
                               loop=True)
    else:
        val_data = None
        nb_val_sample = None

    log.info('Initializing callbacks ...')
    callbacks = self.get_callbacks()

    log.info('Training model ...')
    print()
    print('Training samples: %d' % nb_train_sample)
    if nb_val_sample:
        print('Validation samples: %d' % nb_val_sample)
    # Keras 1 fit_generator signature: samples per epoch is positional, and
    # queue/worker arguments use the old names.
    model.fit_generator(train_data, nb_train_sample, opts.nb_epoch,
                        callbacks=callbacks,
                        validation_data=val_data,
                        nb_val_samples=nb_val_sample,
                        max_q_size=opts.data_q_size,
                        nb_worker=opts.data_nb_worker,
                        verbose=0)

    print('\nTraining set performance:')
    print(format_table(self.perf_logger.epoch_logs,
                       precision=LOG_PRECISION))

    if self.perf_logger.val_epoch_logs:
        print('\nValidation set performance:')
        print(format_table(self.perf_logger.val_epoch_logs,
                           precision=LOG_PRECISION))

    # Restore model with highest validation performance.
    filename = os.path.join(opts.out_dir, 'model_weights_val.h5')
    if os.path.isfile(filename):
        model.load_weights(filename)

    # Delete metrics since they cause problems when loading the model
    # from HDF5 file. Metrics can be loaded from json + weights file.
    model.metrics = None
    model.metrics_names = None
    model.metrics_tensors = None
    model.save(os.path.join(opts.out_dir, 'model.h5'))

    log.info('Done!')
    return 0
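# Hypothetical invocation of this training command (script name, file layout,
# and option values assumed for illustration; check the package's CLI help
# for the exact interface):
#
#   dcpg_train.py ./data/c19_*.h5 --val_files ./data/c18_*.h5 \
#       --out_dir ./model --nb_epoch 25 --batch_size 128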
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    log.debug(opts)

    self.opts = opts
    self.log = log

    # Get performance curve functions from names.
    curve_funs = dict()
    if opts.curves:
        for name in opts.curves:
            curve_funs[name] = get_curve_fun(name)
    anno_curve_funs = dict()
    if opts.anno_curves:
        for name in opts.anno_curves:
            anno_curve_funs[name] = get_curve_fun(name)

    log.info('Loading data ...')
    # Read and sort predictions and outputs.
    output_names = dat.get_output_names(opts.data_file,
                                        regex=opts.output_names,
                                        nb_key=opts.nb_output)
    names = {'chromo': None,
             'pos': None,
             'outputs': output_names,
             'preds': output_names}
    data = hdf.read(opts.data_file, names, nb_sample=opts.nb_sample)
    data['chromo'] = [chromo.decode() for chromo in data['chromo']]
    data['chromo'] = np.array(data['chromo'])
    data = fold_dict(data, nb_level=1)

    # Sort by (chromo, pos) and verify that positions are increasing within
    # each chromosome.
    idx = np.lexsort((data['pos'], data['chromo']))
    data = slice_dict(data, idx)
    for chromo in np.unique(data['chromo']):
        chromo_pos = data['pos'][data['chromo'] == chromo]
        tmp = np.sort(chromo_pos)
        assert np.all(chromo_pos == tmp)
    log.info('%d samples' % len(data['pos']))

    reports = []
    curves = []

    log.info('Evaluating globally ...')
    # Evaluate performances globally.
    report = ev.evaluate_outputs(data['outputs'], data['preds'])
    report['anno'] = ANNO_GLOBAL
    reports.append(report)
    pd.set_option('display.width', 1000)
    print(ev.unstack_report(report))

    if curve_funs:
        # Performance curves.
        for name, fun in curve_funs.items():
            log.info('%s curve' % name)
            curve = ev.evaluate_curve(data['outputs'], data['preds'],
                                      fun=fun,
                                      nb_point=opts.nb_curve_point)
            if curve is not None:
                curve['curve'] = name
                curve['anno'] = ANNO_GLOBAL
                curves.append(curve)

    if opts.anno_files:
        log.info('Evaluating annotations ...')
        # Evaluate annotations.
        for anno_file in opts.anno_files:
            anno = read_anno_file(anno_file)
            anno_name = os.path.splitext(os.path.basename(anno_file))[0]
            idx = annotate(data['chromo'], data['pos'], anno)
            log.info('%s: %d' % (anno_name, idx.sum()))
            if idx.sum() < opts.anno_min_sites:
                log.info('Skipping due to insufficient annotated sites!')
                continue
            # Select data at annotated sites.
            anno_data = slice_dict(data, idx)
            report = ev.evaluate_outputs(anno_data['outputs'],
                                         anno_data['preds'])
            report['anno'] = anno_name
            reports.append(report)
            if anno_curve_funs:
                # Performance curves on the annotated sites only.
                for name, fun in anno_curve_funs.items():
                    log.info('%s curve' % name)
                    curve = ev.evaluate_curve(
                        anno_data['outputs'], anno_data['preds'],
                        fun=fun, nb_point=opts.nb_curve_point)
                    if curve is not None:
                        curve['curve'] = name
                        curve['anno'] = anno_name
                        curves.append(curve)

    make_dir(opts.out_dir)
    if reports:
        report = pd.concat(reports)
        report = report[['anno', 'metric', 'output', 'value']]
        self.save_report(report, 'metrics')
    if curves:
        curves = pd.concat(curves)
        curves = curves[['anno', 'curve', 'output', 'x', 'y', 'thr']]
        self.save_report(curves, 'curves')

    log.info('Done!')
    return 0
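# Hypothetical invocation of this evaluation command (script name, file
# layout, and curve names assumed for illustration):
#
#   dcpg_eval_perf.py ./eval/data.h5 --out_dir ./eval \
#       --anno_files ./annos/CGI.bed --curves roc pr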
# Variant of the training command above, using the Keras 2 API
# (model.output_names and the step-based fit_generator arguments).
def main(self, name, opts):
    logging.basicConfig(filename=opts.log_file,
                        format='%(levelname)s (%(asctime)s): %(message)s')
    log = logging.getLogger(name)
    if opts.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)

    if opts.seed is not None:
        np.random.seed(opts.seed)
        random.seed(opts.seed)

    self.log = log
    self.opts = opts

    make_dir(opts.out_dir)

    log.info('Building model ...')
    model = self.build_model()
    model.summary()
    self.set_trainability(model)

    if opts.filter_weights:
        conv_layer = mod.get_first_conv_layer(model.layers)
        log.info('Initializing filters of %s ...' % conv_layer.name)
        self.init_filter_weights(opts.filter_weights, conv_layer)
    mod.save_model(model, os.path.join(opts.out_dir, 'model.json'))

    log.info('Computing output statistics ...')
    output_names = model.output_names

    output_stats = OrderedDict()
    if opts.no_class_weights:
        class_weights = None
    else:
        class_weights = OrderedDict()
    for name in output_names:
        output = hdf.read(opts.train_files, 'outputs/%s' % name,
                          nb_sample=opts.nb_train_sample)
        output = list(output.values())[0]
        output_stats[name] = get_output_stats(output)
        if class_weights is not None:
            class_weights[name] = get_output_class_weights(name, output)

    self.print_output_stats(output_stats)
    if class_weights:
        self.print_class_weights(class_weights)

    output_weights = None
    if opts.output_weights:
        log.info('Initializing output weights ...')
        output_weights = get_output_weights(output_names,
                                            opts.output_weights)
        print('Output weights:')
        for output_name in output_names:
            if output_name in output_weights:
                print('%s: %.2f' % (output_name,
                                    output_weights[output_name]))
        print()

    self.metrics = dict()
    for output_name in output_names:
        self.metrics[output_name] = get_metrics(output_name)

    optimizer = Adam(lr=opts.learning_rate)
    model.compile(optimizer=optimizer,
                  loss=mod.get_objectives(output_names),
                  loss_weights=output_weights,
                  metrics=self.metrics)

    log.info('Loading data ...')
    replicate_names = dat.get_replicate_names(
        opts.train_files[0],
        regex=opts.replicate_names,
        nb_key=opts.nb_replicate)
    data_reader = mod.data_reader_from_model(
        model, replicate_names=replicate_names)

    nb_train_sample = dat.get_nb_sample(opts.train_files,
                                        opts.nb_train_sample)
    train_data = data_reader(opts.train_files,
                             class_weights=class_weights,
                             batch_size=opts.batch_size,
                             nb_sample=nb_train_sample,
                             shuffle=True,
                             loop=True)

    if opts.val_files:
        nb_val_sample = dat.get_nb_sample(opts.val_files,
                                          opts.nb_val_sample)
        val_data = data_reader(opts.val_files,
                               batch_size=opts.batch_size,
                               nb_sample=nb_val_sample,
                               shuffle=False,
                               loop=True)
    else:
        val_data = None
        nb_val_sample = None

    log.info('Initializing callbacks ...')
    callbacks = self.get_callbacks()

    log.info('Training model ...')
    print()
    print('Training samples: %d' % nb_train_sample)
    if nb_val_sample:
        print('Validation samples: %d' % nb_val_sample)
    # Guard against nb_val_sample being None when no validation files are
    # given.
    if nb_val_sample:
        validation_steps = nb_val_sample // opts.batch_size
    else:
        validation_steps = None
    model.fit_generator(
        train_data,
        steps_per_epoch=nb_train_sample // opts.batch_size,
        epochs=opts.nb_epoch,
        callbacks=callbacks,
        validation_data=val_data,
        validation_steps=validation_steps,
        max_queue_size=opts.data_q_size,
        workers=opts.data_nb_worker,
        verbose=0)

    print('\nTraining set performance:')
    print(format_table(self.perf_logger.epoch_logs,
                       precision=LOG_PRECISION))

    if self.perf_logger.val_epoch_logs:
        print('\nValidation set performance:')
        print(format_table(self.perf_logger.val_epoch_logs,
                           precision=LOG_PRECISION))

    # Restore model with highest validation performance.
    filename = os.path.join(opts.out_dir, 'model_weights_val.h5')
    if os.path.isfile(filename):
        model.load_weights(filename)

    # Delete metrics since they cause problems when loading the model
    # from HDF5 file. Metrics can be loaded from json + weights file.
    model.metrics = None
    model.metrics_names = None
    model.metrics_tensors = None
    model.save(os.path.join(opts.out_dir, 'model.h5'))

    log.info('Done!')
    return 0