def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
  parser = OptionParser(usage)
  parser.add_option('--ai', dest='accuracy_indexes',
      help='Comma-separated list of target indexes to make accuracy scatter plots.')
  parser.add_option('--mc', dest='mc_n',
      default=0, type='int',
      help='Monte Carlo test iterations [Default: %default]')
  parser.add_option('--peak', '--peaks', dest='peaks',
      default=False, action='store_true',
      help='Compute expensive peak accuracy [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='test_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Average the fwd and rc predictions [Default: %default]')
  parser.add_option('--save', dest='save',
      default=False, action='store_true',
      help='Save targets and predictions numpy arrays [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--tfr', dest='tfr_pattern',
      default='test-*.tfr',
      help='TFR pattern string appended to data_dir [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters, model, and test data HDF5')
  else:
    params_file = args[0]
    model_file = args[1]
    data_dir = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # parse shifts to integers
  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  #######################################################
  # inputs

  # read targets
  if options.targets_file is None:
    options.targets_file = '%s/targets.txt' % data_dir
  targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # read data parameters
  data_stats_file = '%s/statistics.json' % data_dir
  with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)

  # construct data ops
  tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
  eval_data = dataset.SeqDataset(tfr_pattern_path,
    seq_length=data_stats['seq_length'],
    target_length=data_stats['target_length'],
    batch_size=params_train['batch_size'],
    mode=tf.estimator.ModeKeys.EVAL)

  # initialize model
  seqnn_model = seqnn.SeqNN(params_model)
  seqnn_model.restore(model_file)
  seqnn_model.build_ensemble(options.rc, options.shifts)

  #######################################################
  # evaluate

  eval_loss = params_train.get('loss', 'poisson')
  test_loss, test_pr, test_r2 = seqnn_model.evaluate(eval_data, loss=eval_loss)
  print('')

  # print summary statistics
  print('Test Loss:     %7.5f' % test_loss)
  print('Test R2:       %7.5f' % test_r2.mean())
  print('Test PearsonR: %7.5f' % test_pr.mean())

  # write target-level statistics
  targets_acc_df = pd.DataFrame({
    'index': targets_df.index,
    'r2': test_r2,
    'pearsonr': test_pr,
    'identifier': targets_df.identifier,
    'description': targets_df.description
  })
  targets_acc_df.to_csv('%s/acc.txt' % options.out_dir, sep='\t',
                        index=False, float_format='%.5f')

  #######################################################
  # predict?

  if options.save or options.peaks or options.accuracy_indexes is not None:
    # compute predictions
    test_preds = seqnn_model.predict(eval_data).astype('float16')

    # read targets
    test_targets = eval_data.numpy(return_inputs=False)

    if options.save:
      preds_h5 = h5py.File('%s/preds.h5' % options.out_dir, 'w')
      preds_h5.create_dataset('preds', data=test_preds)
      preds_h5.close()
      targets_h5 = h5py.File('%s/targets.h5' % options.out_dir, 'w')
      targets_h5.create_dataset('targets', data=test_targets)
      targets_h5.close()

  #######################################################
  # peak call accuracy

  if options.peaks:
    peaks_out_file = '%s/peaks.txt' % options.out_dir
    test_peaks(test_preds, test_targets, peaks_out_file)

  #######################################################
  # accuracy plots

  if options.accuracy_indexes is not None:
    accuracy_indexes = [int(ti) for ti in options.accuracy_indexes.split(',')]

    if not os.path.isdir('%s/scatter' % options.out_dir):
      os.mkdir('%s/scatter' % options.out_dir)
    if not os.path.isdir('%s/violin' % options.out_dir):
      os.mkdir('%s/violin' % options.out_dir)
    if not os.path.isdir('%s/roc' % options.out_dir):
      os.mkdir('%s/roc' % options.out_dir)
    if not os.path.isdir('%s/pr' % options.out_dir):
      os.mkdir('%s/pr' % options.out_dir)

    for ti in accuracy_indexes:
      test_targets_ti = test_targets[:, :, ti]

      ############################################
      # scatter

      # sample every few bins to reduce the number of plotted points
      ds_indexes = np.arange(0, test_preds.shape[1], 8)

      # subset and flatten
      test_targets_ti_flat = test_targets_ti[:, ds_indexes].flatten().astype('float32')
      test_preds_ti_flat = test_preds[:, ds_indexes, ti].flatten().astype('float32')

      # take log2
      test_targets_ti_log = np.log2(test_targets_ti_flat + 1)
      test_preds_ti_log = np.log2(test_preds_ti_flat + 1)

      # plot log2
      sns.set(font_scale=1.2, style='ticks')
      out_pdf = '%s/scatter/t%d.pdf' % (options.out_dir, ti)
      plots.regplot(test_targets_ti_log, test_preds_ti_log, out_pdf,
                    poly_order=1, alpha=0.3, sample=500, figsize=(6, 6),
                    x_label='log2 Experiment', y_label='log2 Prediction',
                    table=True)

      ############################################
      # violin

      # call peaks
      test_targets_ti_lambda = np.mean(test_targets_ti_flat)
      test_targets_pvals = 1 - poisson.cdf(np.round(test_targets_ti_flat) - 1,
                                           mu=test_targets_ti_lambda)
      test_targets_qvals = np.array(ben_hoch(test_targets_pvals))
      test_targets_peaks = test_targets_qvals < 0.01
      test_targets_peaks_str = np.where(test_targets_peaks, 'Peak', 'Background')

      # violin plot
      sns.set(font_scale=1.3, style='ticks')
      plt.figure()
      df = pd.DataFrame({
        'log2 Prediction': np.log2(test_preds_ti_flat + 1),
        'Experimental coverage status': test_targets_peaks_str
      })
      ax = sns.violinplot(x='Experimental coverage status',
                          y='log2 Prediction', data=df)
      ax.grid(True, linestyle=':')
      plt.savefig('%s/violin/t%d.pdf' % (options.out_dir, ti))
      plt.close()

      # ROC
      plt.figure()
      fpr, tpr, _ = roc_curve(test_targets_peaks, test_preds_ti_flat)
      auroc = roc_auc_score(test_targets_peaks, test_preds_ti_flat)
      plt.plot([0, 1], [0, 1], c='black', linewidth=1, linestyle='--', alpha=0.7)
      plt.plot(fpr, tpr, c='black')
      ax = plt.gca()
      ax.set_xlabel('False positive rate')
      ax.set_ylabel('True positive rate')
      ax.text(0.99, 0.02, 'AUROC %.3f' % auroc, horizontalalignment='right')
      ax.grid(True, linestyle=':')
      plt.savefig('%s/roc/t%d.pdf' % (options.out_dir, ti))
      plt.close()

      # PR
      plt.figure()
      prec, recall, _ = precision_recall_curve(test_targets_peaks,
                                               test_preds_ti_flat)
      auprc = average_precision_score(test_targets_peaks, test_preds_ti_flat)
      plt.axhline(y=test_targets_peaks.mean(), c='black',
                  linewidth=1, linestyle='--', alpha=0.7)
      plt.plot(recall, prec, c='black')
      ax = plt.gca()
      ax.set_xlabel('Recall')
      ax.set_ylabel('Precision')
      ax.text(0.99, 0.95, 'AUPRC %.3f' % auprc, horizontalalignment='right')
      ax.grid(True, linestyle=':')
      plt.savefig('%s/pr/t%d.pdf' % (options.out_dir, ti))
      plt.close()
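# ----------------------------------------------------------------------
# NOTE: the peak-calling block above relies on a `ben_hoch` helper that is
# not defined in this excerpt. A minimal sketch, assuming the standard
# Benjamini-Hochberg step-up procedure over the Poisson p-values; not
# necessarily the original implementation.
def ben_hoch(p_values):
  """Return Benjamini-Hochberg q-values for an array of p-values (sketch)."""
  m = len(p_values)
  sort_order = np.argsort(p_values)  # indexes of ascending p-values
  q_values = np.empty(m)
  # q_(k) = min over j >= k of p_(j) * m / j, via a reverse running minimum
  prev_q = 1.0
  for rank in range(m, 0, -1):
    i = sort_order[rank - 1]
    prev_q = min(prev_q, p_values[i] * m / rank)
    q_values[i] = prev_q
  return q_values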
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='track_bed',
      help='BED file describing regions so we can output BigWig tracks')
  parser.add_option('-g', dest='genome_file',
      default='%s/tutorials/data/human.hg19.genome' % os.environ['BASENJIDIR'],
      help='Chromosome length information [Default: %default]')
  parser.add_option('--mc', dest='mc_n',
      default=0, type='int',
      help='Monte Carlo test iterations [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='test_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Average the fwd and rc predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--ti', dest='track_indexes',
      help='Comma-separated list of target indexes to output BigWig tracks')
  parser.add_option('--tfr', dest='tfr_pattern',
      default='test-*.tfr',
      help='TFR pattern string appended to data_dir [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters, model, and test data HDF5')
  else:
    params_file = args[0]
    model_file = args[1]
    data_dir = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # parse shifts to integers
  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  # read targets
  if options.targets_file is None:
    options.targets_file = '%s/targets.txt' % data_dir
    targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')
    target_subset = None
  else:
    targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')
    target_subset = targets_df.index

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # read data parameters
  data_stats_file = '%s/statistics.json' % data_dir
  with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)

  # construct data ops
  tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
  eval_data = dataset.SeqDataset(tfr_pattern_path,
    seq_length=params_model['seq_length'],
    target_length=data_stats['target_length'],
    batch_size=params_train['batch_size'],
    mode=tf.estimator.ModeKeys.EVAL)

  # initialize model
  seqnn_model = seqnn.SeqNN(params_model)
  seqnn_model.restore(model_file)
  seqnn_model.build_ensemble(options.rc, options.shifts)

  # predict
  test_preds = seqnn_model.predict(eval_data, verbose=1).astype('float16')

  # save
  preds_h5 = h5py.File('%s/preds.h5' % options.out_dir, 'w')
  preds_h5.create_dataset('preds', data=test_preds)
  preds_h5.close()

  # print normalization factors
  target_means = test_preds.mean(axis=(0, 1), dtype='float64')
  target_means_median = np.median(target_means)
  norm_out = open('%s/normalization.txt' % options.out_dir, 'w')
  for ti in range(len(target_means)):
    print(ti, target_means[ti], target_means_median / target_means[ti],
          file=norm_out)
  norm_out.close()

  #######################################################
  # BigWig tracks

  # print bigwig tracks for visualization
  if options.track_bed:
    if options.genome_file is None:
      parser.error('Must provide genome file in order to print valid BigWigs.')

    if not os.path.isdir('%s/tracks' % options.out_dir):
      os.mkdir('%s/tracks' % options.out_dir)

    track_indexes = range(test_preds.shape[2])
    if options.track_indexes:
      track_indexes = [int(ti) for ti in options.track_indexes.split(',')]

    for ti in track_indexes:
      # make predictions bigwig
      bw_file = '%s/tracks/t%d_preds.bw' % (options.out_dir, ti)
      # FIXME: `model` is undefined here; `model.hp.batch_buffer` appears to
      # be a stale reference to the pre-TF2 API, left as in the original.
      bigwig_write(bw_file, test_preds[:, :, ti], options.track_bed,
                   options.genome_file, model.hp.batch_buffer)
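# ----------------------------------------------------------------------
# NOTE: `bigwig_write` is not defined in this excerpt. A minimal sketch of
# one plausible implementation using pyBigWig; the `bin_size` parameter,
# the per-region layout, and the buffer handling are illustrative
# assumptions, not the original helper.
def bigwig_write(bw_file, signal, track_bed, genome_file, buffer_len,
                 bin_size=128):
  """Write per-region prediction bins to a BigWig track (sketch).

  Assumes `signal` has shape (num_regions, num_bins) aligned to the BED
  rows, and that `buffer_len` bases at each region edge were cropped from
  the predictions.
  """
  import pyBigWig

  # read chromosome lengths for the BigWig header
  chrom_lengths = []
  with open(genome_file) as genome_open:
    for line in genome_open:
      a = line.split()
      chrom_lengths.append((a[0], int(a[1])))

  bw = pyBigWig.open(bw_file, 'w')
  bw.addHeader(chrom_lengths)

  with open(track_bed) as bed_open:
    for ri, line in enumerate(bed_open):
      a = line.split()
      chrom, start = a[0], int(a[1])
      # one fixed-width BigWig entry per prediction bin
      bw.addEntries(chrom, start + buffer_len,
                    values=[float(v) for v in signal[ri]],
                    span=bin_size, step=bin_size)

  bw.close()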
def main():
  usage = 'usage: %prog [options] <params_file> <data1_dir> <data2_dir> ...'
  parser = OptionParser(usage)
  parser.add_option('-o', dest='out_dir',
      default='train2_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--restore', dest='restore',
      help='Restore model and continue training [Default: %default]')
  parser.add_option('--trunk', dest='trunk',
      default=False, action='store_true',
      help='Restore only model trunk [Default: %default]')
  parser.add_option('--tfr_train', dest='tfr_train_pattern',
      default=None,
      help='Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]')
  parser.add_option('--tfr_eval', dest='tfr_eval_pattern',
      default=None,
      help='Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) < 2:
    parser.error('Must provide parameters and data directories.')
  else:
    params_file = args[0]
    data_dirs = args[1:]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)
  if params_file != '%s/params.json' % options.out_dir:
    shutil.copy(params_file, '%s/params.json' % options.out_dir)

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # read datasets
  train_data = []
  eval_data = []
  for data_dir in data_dirs:
    # load train data
    train_data.append(dataset.SeqDataset(data_dir,
      split_label='train',
      batch_size=params_train['batch_size'],
      mode=tf.estimator.ModeKeys.TRAIN,
      tfr_pattern=options.tfr_train_pattern))

    # load eval data
    eval_data.append(dataset.SeqDataset(data_dir,
      split_label='valid',
      batch_size=params_train['batch_size'],
      mode=tf.estimator.ModeKeys.EVAL,
      tfr_pattern=options.tfr_eval_pattern))

  if params_train.get('num_gpu', 1) == 1:
    ########################################
    # one GPU

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)

    # restore
    if options.restore:
      seqnn_model.restore(options.restore, options.trunk)

    # initialize trainer
    seqnn_trainer = trainer.Trainer(params_train, train_data,
                                    eval_data, options.out_dir)

    # compile model
    seqnn_trainer.compile(seqnn_model)

    # train model
    seqnn_trainer.fit2(seqnn_model)

  else:
    ########################################
    # multiple GPUs

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
      # distribute data
      for di in range(len(data_dirs)):
        train_data[di].distribute(strategy)
        eval_data[di].distribute(strategy)

      # initialize model
      seqnn_model = seqnn.SeqNN(params_model)

      # restore
      if options.restore:
        seqnn_model.restore(options.restore, options.trunk)

      # initialize trainer
      seqnn_trainer = trainer.Trainer(params_train, train_data,
                                      eval_data, options.out_dir,
                                      strategy, params_train['num_gpu'])

      # compile model
      seqnn_trainer.compile(seqnn_model)

      # train model
      seqnn_trainer.fit2(seqnn_model)
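# ----------------------------------------------------------------------
# NOTE: `SeqDataset.distribute(strategy)` above is a Basenji wrapper method.
# In plain TensorFlow, the corresponding step is roughly the sketch below;
# `distribute_dataset_sketch` is a hypothetical stand-in for SeqDataset's
# internal tf.data pipeline, and tf.data.AUTOTUNE requires TF >= 2.4.
def distribute_dataset_sketch(tfr_pattern, batch_size, strategy):
  files = tf.data.Dataset.list_files(tfr_pattern)
  ds = tf.data.TFRecordDataset(files)
  # ... per-example parsing and augmentation would go here ...
  ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
  # shard the batched pipeline across replicas
  return strategy.experimental_distribute_dataset(ds)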
def main():
  usage = 'usage: %prog [options] <params_file> <data1_dir> <data2_dir> ...'
  parser = OptionParser(usage)
  parser.add_option('-o', dest='out_dir',
      default='train2_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--restore', dest='restore',
      help='Restore model and continue training [Default: %default]')
  parser.add_option('--trunk', dest='trunk',
      default=False, action='store_true',
      help='Restore only model trunk [Default: %default]')
  parser.add_option('--tfr_train', dest='tfr_train_pattern',
      default='train-*.tfr',
      help='Training TFRecord pattern string appended to data_dir [Default: %default]')
  parser.add_option('--tfr_eval', dest='tfr_eval_pattern',
      default='valid-*.tfr',
      help='Evaluation TFRecord pattern string appended to data_dir [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) < 2:
    parser.error('Must provide parameters and data directories.')
  else:
    params_file = args[0]
    data_dirs = args[1:]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)
  if params_file != '%s/params.json' % options.out_dir:
    shutil.copy(params_file, '%s/params.json' % options.out_dir)

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # read datasets
  data_stats = []
  train_data = []
  eval_data = []
  for data_dir in data_dirs:
    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
      data_stats.append(json.load(data_stats_open))

    # load train data
    # (note: the first dataset's dimensions are used for every dataset,
    # which assumes all datasets share seq_length and target_length)
    tfr_train_full = '%s/tfrecords/%s' % (data_dir, options.tfr_train_pattern)
    train_data.append(dataset.SeqDataset(tfr_train_full,
      seq_length=data_stats[0]['seq_length'],
      target_length=data_stats[0]['target_length'],
      batch_size=params_train['batch_size'],
      mode=tf.estimator.ModeKeys.TRAIN))

    # load eval data
    tfr_eval_full = '%s/tfrecords/%s' % (data_dir, options.tfr_eval_pattern)
    eval_data.append(dataset.SeqDataset(tfr_eval_full,
      seq_length=data_stats[0]['seq_length'],
      target_length=data_stats[0]['target_length'],
      batch_size=params_train['batch_size'],
      mode=tf.estimator.ModeKeys.EVAL))

  if params_train.get('num_gpu', 1) == 1:
    ########################################
    # one GPU

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)

    # restore
    if options.restore:
      seqnn_model.restore(options.restore, options.trunk)

    # initialize trainer
    seqnn_trainer = trainer.Trainer(params_train, train_data,
                                    eval_data, options.out_dir)

    # compile model
    seqnn_trainer.compile(seqnn_model)

    # train model
    seqnn_trainer.fit2(seqnn_model)

  else:
    ########################################
    # multiple GPUs

    print('Multiple GPUs untested for joint genome training.', file=sys.stderr)
    exit(1)

    # unreached until multi-GPU joint training is verified
    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
      # initialize model
      seqnn_model = seqnn.SeqNN(params_model)

      # restore
      if options.restore:
        seqnn_model.restore(options.restore, options.trunk)

      # initialize trainer
      seqnn_trainer = trainer.Trainer(params_train, train_data,
                                      eval_data, options.out_dir)

      # compile model
      seqnn_trainer.compile(seqnn_model)

      # train model
      seqnn_trainer.fit2(seqnn_model)
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
  parser = OptionParser(usage)
  parser.add_option('--head', dest='head_i',
      default=0, type='int',
      help='Parameters head to test [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='test_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Average the fwd and rc predictions [Default: %default]')
  parser.add_option('--save', dest='save',
      default=False, action='store_true',
      help='Save targets and predictions numpy arrays [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--split', dest='split_label',
      default='test',
      help='Dataset split label for eg TFR pattern [Default: %default]')
  parser.add_option('--tfr', dest='tfr_pattern',
      default=None,
      help='TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]')
  parser.add_option('-v', dest='high_var_pct',
      default=1.0, type='float',
      help='Highly variable site proportion to take [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters, model, and test data HDF5')
  else:
    params_file = args[0]
    model_file = args[1]
    data_dir = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # parse shifts to integers
  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  #######################################################
  # targets

  # read table
  if options.targets_file is None:
    options.targets_file = '%s/targets.txt' % data_dir
  targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')
  num_targets = targets_df.shape[0]

  # classify
  target_classes = []
  for ti in range(num_targets):
    description = targets_df.iloc[ti].description
    if description.find(':') == -1:
      tc = '*'
    else:
      desc_split = description.split(':')
      if desc_split[0] == 'CHIP':
        tc = '/'.join(desc_split[:2])
      else:
        tc = desc_split[0]
    target_classes.append(tc)
  targets_df['class'] = target_classes
  target_classes = sorted(set(target_classes))
  print(target_classes)

  #######################################################
  # model

  # read parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # construct eval data
  eval_data = dataset.SeqDataset(data_dir,
    split_label=options.split_label,
    batch_size=params_train['batch_size'],
    mode=tf.estimator.ModeKeys.EVAL,
    tfr_pattern=options.tfr_pattern)

  # initialize model
  seqnn_model = seqnn.SeqNN(params_model)
  seqnn_model.restore(model_file, options.head_i)
  seqnn_model.build_ensemble(options.rc, options.shifts)
  seqnn_model.downcast()

  #######################################################
  # targets/predictions

  # option to read from disk?

  # predict
  eval_preds = seqnn_model.predict(eval_data, verbose=1)
  print('')

  # targets
  eval_targets = eval_data.numpy(return_inputs=False, return_outputs=True)

  # flatten
  eval_preds = np.reshape(eval_preds, (-1, num_targets))
  eval_targets = np.reshape(eval_targets, (-1, num_targets))

  #######################################################
  # process classes

  targets_spec = np.zeros(num_targets)
  for tc in target_classes:
    class_mask = np.array(targets_df['class'] == tc)
    num_targets_class = class_mask.sum()

    if num_targets_class == 1:
      targets_spec[class_mask] = np.nan
    else:
      # slice class
      eval_preds_class = eval_preds[:, class_mask].astype('float32')
      eval_targets_class = eval_targets[:, class_mask].astype('float32')

      # highly variable filter
      if options.high_var_pct < 1:
        eval_targets_var = eval_targets_class.var(axis=1)
        high_var_t = np.percentile(eval_targets_var,
                                   100 * (1 - options.high_var_pct))
        high_var_mask = (eval_targets_var >= high_var_t)
        eval_preds_class = eval_preds_class[high_var_mask]
        eval_targets_class = eval_targets_class[high_var_mask]

      # quantile normalize
      eval_preds_norm = quantile_normalize(eval_preds_class)
      eval_targets_norm = quantile_normalize(eval_targets_class)

      # mean normalize
      eval_preds_norm = eval_preds_norm - eval_preds_norm.mean(axis=-1, keepdims=True)
      eval_targets_norm = eval_targets_norm - eval_targets_norm.mean(axis=-1, keepdims=True)

      # compute correlations
      pearsonr_class = np.zeros(num_targets_class)
      for ti in range(num_targets_class):
        pearsonr_class[ti] = pearsonr(eval_preds_norm[:, ti],
                                      eval_targets_norm[:, ti])[0]

      # save
      targets_spec[class_mask] = pearsonr_class

      # print the class summary (the original indexed the stale loop
      # variable `ti` here, printing only the last target's correlation)
      print('%-15s %4d %.4f' % (tc, num_targets_class, pearsonr_class.mean()))

  # write target-level statistics
  targets_acc_df = pd.DataFrame({
    'index': targets_df.index,
    'pearsonr': targets_spec,
    'identifier': targets_df.identifier,
    'description': targets_df.description
  })
  targets_acc_df.to_csv('%s/acc.txt' % options.out_dir, sep='\t',
                        index=False, float_format='%.5f')
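# ----------------------------------------------------------------------
# NOTE: `quantile_normalize` is imported from elsewhere in the codebase.
# A minimal numpy sketch of column-wise quantile normalization, assuming
# the usual rank-mean scheme (ties broken arbitrarily by argsort); not
# necessarily the original implementation.
def quantile_normalize(X):
  """Quantile normalize the columns of a 2D array (sketch)."""
  # mean value at each rank, averaged across columns
  rank_means = np.sort(X, axis=0).mean(axis=1)
  # within-column rank of every entry
  ranks = np.argsort(np.argsort(X, axis=0), axis=0)
  # replace each entry with the mean value for its rank
  return rank_means[ranks]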
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
  parser = OptionParser(usage)
  parser.add_option('-a', dest='act_t',
      default=0.5, type='float',
      help='Activation threshold (as proportion of max) to consider for PWM [Default: %default]')
  parser.add_option('-d', dest='plot_density',
      default=False, action='store_true',
      help='Plot filter activation density [Default: %default]')
  parser.add_option('--heat', dest='plot_heats',
      default=False, action='store_true',
      help='Plot heat maps describing filter activations in the test sequences [Default: %default]')
  parser.add_option('-l', dest='seq_length_crop',
      default=None, type='int',
      help='Crop sequences to shorter length [Default: %default]')
  parser.add_option('-o', dest='out_dir', default='basenji_motifs')
  parser.add_option('-m', dest='meme_db',
      default='%s/cisbp/Homo_sapiens.meme' % os.environ['HG38'],
      help='MEME database used to annotate motifs')
  parser.add_option('-p', dest='parallel_threads',
      default=1, type='int',
      help='Generate weblogos in parallel threads [Default: %default]')
  parser.add_option('-s', dest='sample',
      default=None, type='int',
      help='Sample sequences from the test set [Default: %default]')
  parser.add_option('-t', dest='trim_filters',
      default=False, action='store_true',
      help='Trim uninformative positions off the filter ends [Default: %default]')
  parser.add_option('--tfr', dest='tfr_pattern',
      default='test-*.tfr',
      help='TFR pattern string appended to data_dir [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide Basenji params and model files and data directory')
  else:
    params_file = args[0]
    model_file = args[1]
    data_dir = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  #######################################################
  # inputs

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  if options.seq_length_crop is not None:
    params_model['seq_length'] = options.seq_length_crop

  # read data parameters
  data_stats_file = '%s/statistics.json' % data_dir
  with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)

  # construct data ops
  tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
  eval_data = dataset.SeqDataset(tfr_pattern_path,
    seq_length=data_stats['seq_length'],
    seq_length_crop=options.seq_length_crop,
    target_length=data_stats['target_length'],
    batch_size=params_train['batch_size'],
    mode=tf.estimator.ModeKeys.EVAL)

  # obtain sequences
  eval_seqs_1hot = eval_data.numpy(return_inputs=True, return_outputs=False)
  eval_seqs_dna = dna_io.hot1_dna(eval_seqs_1hot)
  del eval_seqs_1hot

  #################################################################
  # model

  # initialize model
  seqnn_model = seqnn.SeqNN(params_model)
  seqnn_model.restore(model_file)

  # first layer embedding
  seqnn_model.build_embed(0)
  _, preds_length, preds_depth = seqnn_model.embed.output.shape

  # get weights
  filter_weights = seqnn_model.get_conv_weights()
  print(filter_weights.shape)
  num_filters, _, filter_size = filter_weights.shape

  # compute filter activations
  filter_outs = seqnn_model.predict(eval_data)
  print(filter_outs.shape)

  #################################################################
  # individual filter plots

  # save information contents
  filters_ic = []
  meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir, eval_seqs_dna)

  # plot weblogos of high-scoring outputs (in parallel)
  if options.parallel_threads > 1:
    pfl_args = []
    for f in range(num_filters):
      pfl_args.append((filter_outs[:, :, f], filter_size, eval_seqs_dna,
                       '%s/filter%d_logo' % (options.out_dir, f),
                       options.act_t))
    with multiprocessing.get_context('spawn').Pool(options.parallel_threads) as pool:
      pool.starmap(plot_filter_logo, pfl_args)

  for f in range(num_filters):
    print('Filter %d' % f)

    # plot filter parameters as a heatmap
    plot_filter_heat(filter_weights[f, :, :],
                     '%s/filter%d_heat.pdf' % (options.out_dir, f))

    if options.parallel_threads == 1:
      plot_filter_logo(filter_outs[:, :, f], filter_size, eval_seqs_dna,
                       '%s/filter%d_logo' % (options.out_dir, f),
                       options.act_t)

    # make a PWM for the filter
    filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' %
                                         (options.out_dir, f))

    if nsites < 10:
      # no information
      filters_ic.append(0)
    else:
      # compute and save information content
      filters_ic.append(info_content(filter_pwm))

      # add to the MEME motif file
      meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

  meme_out.close()

  #################################################################
  # annotate filters
  #################################################################

  # run Tomtom
  subprocess.call('tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s' %
                  (options.out_dir, options.out_dir, options.meme_db),
                  shell=True)

  # read in annotations
  filter_names = name_filters(num_filters,
                              '%s/tomtom/tomtom.tsv' % options.out_dir,
                              options.meme_db)

  #################################################################
  # print a table of information
  #################################################################

  table_out = open('%s/table.txt' % options.out_dir, 'w')

  # print header for later pandas reading
  header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
  print('%3s %19s %10s %5s %6s %6s' % header_cols, file=table_out)

  for f in range(num_filters):
    # collapse to a consensus motif
    consensus = filter_motif(filter_weights[f, :, :])

    # grab annotation
    annotation = '.'
    name_pieces = filter_names[f].split('_')
    if len(name_pieces) > 1:
      annotation = name_pieces[1]

    f_scores = np.ravel(filter_outs[:, :, f])
    fmean, fstd = f_scores.mean(), f_scores.std()

    if options.plot_density:
      # plot density of filter output scores
      plot_score_density(f_scores,
                         '%s/filter%d_dens.pdf' % (options.out_dir, f))

    row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
    print('%-3d %19s %10s %5.2f %6.4f %6.4f' % row_cols, file=table_out)

  table_out.close()

  #################################################################
  # global filter plots
  #################################################################

  # these methods make less sense for longer sequences;
  # the sequences should be fragmented first.
  if options.plot_heats:
    # plot filter-sequence heatmap
    plot_filter_seq_heat(filter_outs, '%s/filter_seqs.pdf' % options.out_dir)

    # plot filter-segment heatmap
    plot_filter_seg_heat(filter_outs, '%s/filter_segs.pdf' % options.out_dir)
    plot_filter_seg_heat(filter_outs,
                         '%s/filter_segs_raw.pdf' % options.out_dir,
                         whiten=False)

    # plot filter-target correlation heatmap
    # NOTE: seq_targets and target_names are not defined in this function;
    # these calls appear to be carried over from an older version.
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_mean.pdf' % options.out_dir,
                     'mean')
    plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                     '%s/filter_target_cors_max.pdf' % options.out_dir,
                     'max')
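# ----------------------------------------------------------------------
# NOTE: `make_filter_pwm` and `info_content` are defined elsewhere in the
# script. For reference, a small sketch of PWM information content under
# the usual uniform-background convention (2 bits minus entropy at each
# position, summed over positions); the pseudocount is an illustrative
# assumption.
def info_content(pwm, pseudocount=1e-9):
  """Total information content of a (positions x 4) DNA PWM, in bits (sketch)."""
  pwm = np.asarray(pwm, dtype='float64') + pseudocount
  pwm /= pwm.sum(axis=1, keepdims=True)        # renormalize each position
  entropy = -(pwm * np.log2(pwm)).sum(axis=1)  # per-position entropy
  return float((2.0 - entropy).sum())          # 2 bits max per DNA position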
def main(_):
  # I could write some additional code around this to check for common
  # problems, such as with num_targets.
  with open(FLAGS.params) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  if not params_train.get('use_gpu', True):
    # need to blind to the GPUs before tf is imported
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    print('\n training on CPU \n')

  import shutil
  if not os.path.isdir(FLAGS.log_dir):
    os.mkdir(FLAGS.log_dir)
  if not os.path.isfile(FLAGS.log_dir + '/params.json'):
    shutil.copy(FLAGS.params, FLAGS.log_dir + '/params.json')

  # import tensorflow only after CUDA_VISIBLE_DEVICES is set
  import tensorflow as tf
  if tf.__version__[0] == '1':
    tf.compat.v1.enable_eager_execution()
  print('tf version:', tf.__version__)

  from basenji import dataset
  from basenji import seqnn
  from basenji import trainer

  # load data
  diagonal_offset = params_model.get('diagonal_offset', 2)
  target_crop = params_model.get('target_crop', 0)
  target_length_crop = params_model['target_length'] - diagonal_offset - 2 * target_crop

  # length of the flattened upper triangle of the cropped contact map
  tlen = target_length_crop * (target_length_crop + 1) // 2

  train_data = dataset.SeqDataset(FLAGS.train_data,
                                  params_train['batch_size'],
                                  params_model['seq_length'],
                                  tlen,
                                  tf.estimator.ModeKeys.TRAIN)
  eval_data = dataset.SeqDataset(FLAGS.eval_data,
                                 params_train['batch_size'],
                                 params_model['seq_length'],
                                 tlen,
                                 tf.estimator.ModeKeys.EVAL)

  if params_train.get('num_gpu', 1) == 1:
    ########################################
    # one GPU

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)

    # restore
    if FLAGS.restore:
      seqnn_model.restore(FLAGS.restore, FLAGS.trunk)
      print('restored weights')

    if FLAGS.freeze_trunk:
      seqnn_model.model_trunk.trainable = False
      print('frozen trunk')

    # initialize trainer
    seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data)

    # compile model
    seqnn_trainer.compile(seqnn_model.model)

    # train model
    seqnn_trainer.fit(seqnn_model.model)

  else:
    ########################################
    # multiple GPUs
    print('need to update multigpu')
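# ----------------------------------------------------------------------
# Worked example for the `tlen` computation above (illustrative parameter
# values, not the shipped config): with target_length=512,
# diagonal_offset=2, and target_crop=32,
#   target_length_crop = 512 - 2 - 2 * 32 = 446
#   tlen = 446 * 447 // 2 = 99681
# i.e. the flattened upper triangle of a 446 x 446 contact matrix, with
# the near-diagonal band removed by the offset.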
def main(_):
  # I could write some additional code around this to check for common
  # problems, such as with num_targets.
  with open(FLAGS.params) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  if not params_train.get('use_gpu', True):
    # blind to the GPUs (note: this only takes effect if it runs before
    # TensorFlow initializes the devices)
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    print('\n training on CPU \n')

  # load data
  train_data = dataset.SeqDataset(FLAGS.train_data,
                                  params_train['batch_size'],
                                  params_model['seq_length'],
                                  params_model['target_length'],
                                  tf.estimator.ModeKeys.TRAIN)
  eval_data = dataset.SeqDataset(FLAGS.eval_data,
                                 params_train['batch_size'],
                                 params_model['seq_length'],
                                 params_model['target_length'],
                                 tf.estimator.ModeKeys.EVAL)

  if params_train.get('num_gpu', 1) == 1:
    ########################################
    # one GPU

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)

    # restore
    if FLAGS.restore:
      seqnn_model.restore(FLAGS.restore, FLAGS.trunk)

    # initialize trainer
    seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data)

    # compile model
    seqnn_trainer.compile(seqnn_model.model)

    # train model
    seqnn_trainer.fit(seqnn_model.model)

  else:
    ########################################
    # multiple GPUs

    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
      # initialize model
      seqnn_model = seqnn.SeqNN(params_model)

      # restore
      if FLAGS.restore:
        seqnn_model.restore(FLAGS.restore, FLAGS.trunk)

      # initialize trainer
      seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data)

      # compile model
      seqnn_trainer.compile(seqnn_model.model, None)

      # train model
      seqnn_trainer.fit(seqnn_model.model)
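# ----------------------------------------------------------------------
# NOTE: the two main(_) entry points above read FLAGS defined elsewhere in
# their scripts. A sketch of the flag definitions they appear to rely on,
# assuming absl-style flags; the names match the usages above, the defaults
# are illustrative.
from absl import app, flags

flags.DEFINE_string('params', None, 'JSON file with model and train parameters')
flags.DEFINE_string('train_data', None, 'Training TFRecord pattern')
flags.DEFINE_string('eval_data', None, 'Evaluation TFRecord pattern')
flags.DEFINE_string('log_dir', 'train_out', 'Output/log directory')
flags.DEFINE_string('restore', None, 'Checkpoint to restore before training')
flags.DEFINE_boolean('trunk', False, 'Restore only the model trunk')
flags.DEFINE_boolean('freeze_trunk', False, 'Freeze trunk weights after restore')
FLAGS = flags.FLAGS

if __name__ == '__main__':
  app.run(main)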