def read_data(learning_file):
    """Build the train/valid datasets and their DataLoaders from `learning_file`.

    Args:
        learning_file: path handed to dataset.SeqDataset for both splits.

    Returns:
        (train_data, valid_data, train_loader, valid_loader)
    """
    # Training split.
    train_data = dataset.SeqDataset(learning_file, state='Train', k=0)
    # Validation split.
    valid_data = dataset.SeqDataset(learning_file, state='Valid', k=0)

    logger.info('num_of_trainData:' + str(len(train_data)))
    logger.info('num_of_validData:' + str(len(valid_data)))
    logger.info('train positive label sum:' + str(np.sum(np.array(train_data.labels))))
    logger.info('valid positive label sum:' + str(np.sum(np.array(valid_data.labels))))
    logger.info('Positive samples proportion:{:.5f}'.format(
        (np.sum(np.array(train_data.labels)) + np.sum(np.array(valid_data.labels)))
        / (len(train_data) + len(valid_data))))

    train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    # Aim for ~50 validation batches, but guard against a batch size of 0
    # when the validation set holds fewer than 50 samples: DataLoader raises
    # ValueError on batch_size=0 (the original int(len/50) could be 0).
    valid_loader = Data.DataLoader(dataset=valid_data,
                                   batch_size=max(1, len(valid_data) // 50),
                                   shuffle=True)
    return train_data, valid_data, train_loader, valid_loader
# NOTE(review): this is a run of statements, not a def; it relies on
# data_stats_file and data_dir being defined earlier (outside this view).

# Load the dataset-statistics JSON that describes sequence/target geometry.
with open(data_stats_file) as data_stats_open:
    data_stats = json.load(data_stats_open)
seq_length = data_stats['seq_length']
target_length = data_stats['target_length']
# Offset of the first scored diagonal in the contact map.
hic_diags = data_stats['diagonal_offset']
# Bins cropped from each map edge: crop in bp divided by bin width.
target_crop = data_stats['crop_bp'] // data_stats['pool_width']
# Uncropped bin count along one side of the map.
target_length1 = data_stats['seq_length'] // data_stats['pool_width']

### load data ###
# BED-like table of regions with a train/valid/test split label per row.
sequences = pd.read_csv(data_dir + '/sequences.bed', sep='\t', names=['chr', 'start', 'stop', 'type'])
# Keep only the regions labeled 'test' and renumber them from 0.
sequences_test = sequences.iloc[sequences['type'].values == 'test']
sequences_test.reset_index(inplace=True, drop=True)

print("going to load test dataset")
test_data = dataset.SeqDataset(data_dir, 'test', batch_size=8)

# test_targets is a float array with shape
# [#regions, #pixels, target #target datasets]
# representing log(obs/exp)data, where #pixels
# corresponds to the number of entries in the flattened
# upper-triangular representation of the matrix
# test_inputs are 1-hot encoded arrays with shape
# [#regions, 2^20 bp, 4 nucleotides datasets]
test_inputs, test_targets = test_data.numpy(return_inputs=True, return_outputs=True)
# print(test_targets)
def main():
  """Evaluate a trained SeqNN model on a held-out data split.

  Command line: <params_file> <model_file> <data_dir>.  Writes a summary
  accuracy table (acc.txt) and, on request, saved predictions/targets,
  peak-call accuracy, and per-target scatter/violin/ROC/PR plots under
  the --out_dir directory.
  """
  usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
  parser = OptionParser(usage)
  parser.add_option('--ai', dest='accuracy_indexes',
      help='Comma-separated list of target indexes to make accuracy scatter plots.')
  parser.add_option('--head', dest='head_i', default=0, type='int',
      help='Parameters head to test [Default: %default]')
  parser.add_option('--mc', dest='mc_n', default=0, type='int',
      help='Monte carlo test iterations [Default: %default]')
  parser.add_option('--peak', '--peaks', dest='peaks', default=False, action='store_true',
      help='Compute expensive peak accuracy [Default: %default]')
  parser.add_option('-o', dest='out_dir', default='test_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--rc', dest='rc', default=False, action='store_true',
      help='Average the fwd and rc predictions [Default: %default]')
  parser.add_option('--save', dest='save', default=False, action='store_true',
      help='Save targets and predictions numpy arrays [Default: %default]')
  parser.add_option('--shifts', dest='shifts', default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file', default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--split', dest='split_label', default='test',
      help='Dataset split label for eg TFR pattern [Default: %default]')
  parser.add_option('--tfr', dest='tfr_pattern', default=None,
      help='TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]')
  (options, args) = parser.parse_args()

  # Exactly three positional arguments are required.
  if len(args) != 3:
    parser.error('Must provide parameters, model, and test data HDF5')
  else:
    params_file = args[0]
    model_file = args[1]
    data_dir = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # parse shifts to integers
  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  #######################################################
  # inputs

  # read targets; default to the table shipped with the data directory
  if options.targets_file is None:
    options.targets_file = '%s/targets.txt' % data_dir
  targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # construct eval data
  eval_data = dataset.SeqDataset(data_dir,
      split_label=options.split_label,
      batch_size=params_train['batch_size'],
      mode='eval',
      tfr_pattern=options.tfr_pattern)

  # initialize model and restore the requested head's weights
  seqnn_model = seqnn.SeqNN(params_model)
  seqnn_model.restore(model_file, options.head_i)
  # Optionally ensemble over reverse-complement and shifted inputs.
  seqnn_model.build_ensemble(options.rc, options.shifts)

  #######################################################
  # evaluate

  # loss choice also decides which metric pair evaluate() returns
  loss_label = params_train.get('loss', 'poisson').lower()
  spec_weight = params_train.get('spec_weight', 1)
  loss_fn = trainer.parse_loss(loss_label, spec_weight=spec_weight)

  # evaluate
  test_loss, test_metric1, test_metric2 = seqnn_model.evaluate(eval_data, loss=loss_fn)

  # print summary statistics
  print('\nTest Loss: %7.5f' % test_loss)

  if loss_label == 'bce':
    # classification metrics (AUROC/AUPRC per target)
    print('Test AUROC: %7.5f' % test_metric1.mean())
    print('Test AUPRC: %7.5f' % test_metric2.mean())

    # write target-level statistics
    targets_acc_df = pd.DataFrame({
        'index': targets_df.index,
        'auroc': test_metric1,
        'auprc': test_metric2,
        'identifier': targets_df.identifier,
        'description': targets_df.description
    })
  else:
    # regression metrics (PearsonR/R2 per target)
    print('Test PearsonR: %7.5f' % test_metric1.mean())
    print('Test R2: %7.5f' % test_metric2.mean())

    # write target-level statistics
    targets_acc_df = pd.DataFrame({
        'index': targets_df.index,
        'pearsonr': test_metric1,
        'r2': test_metric2,
        'identifier': targets_df.identifier,
        'description': targets_df.description
    })

  targets_acc_df.to_csv('%s/acc.txt'%options.out_dir, sep='\t',
                        index=False, float_format='%.5f')

  #######################################################
  # predict?

  if options.save or options.peaks or options.accuracy_indexes is not None:
    # compute predictions (float16 to halve memory for large test sets)
    test_preds = seqnn_model.predict(eval_data).astype('float16')

    # read targets
    test_targets = eval_data.numpy(return_inputs=False)

    if options.save:
      # dump raw predictions and targets for downstream analysis
      preds_h5 = h5py.File('%s/preds.h5' % options.out_dir, 'w')
      preds_h5.create_dataset('preds', data=test_preds)
      preds_h5.close()
      targets_h5 = h5py.File('%s/targets.h5' % options.out_dir, 'w')
      targets_h5.create_dataset('targets', data=test_targets)
      targets_h5.close()

    #######################################################
    # peak call accuracy

    if options.peaks:
      peaks_out_file = '%s/peaks.txt' % options.out_dir
      test_peaks(test_preds, test_targets, peaks_out_file)

    #######################################################
    # accuracy plots

    if options.accuracy_indexes is not None:
      accuracy_indexes = [int(ti) for ti in options.accuracy_indexes.split(',')]

      if not os.path.isdir('%s/scatter' % options.out_dir):
        os.mkdir('%s/scatter' % options.out_dir)
      if not os.path.isdir('%s/violin' % options.out_dir):
        os.mkdir('%s/violin' % options.out_dir)
      if not os.path.isdir('%s/roc' % options.out_dir):
        os.mkdir('%s/roc' % options.out_dir)
      if not os.path.isdir('%s/pr' % options.out_dir):
        os.mkdir('%s/pr' % options.out_dir)

      for ti in accuracy_indexes:
        test_targets_ti = test_targets[:, :, ti]

        ############################################
        # scatter

        # sample every few bins (adjust to plot the points I want)
        ds_indexes = np.arange(0, test_preds.shape[1], 8)

        # subset and flatten
        test_targets_ti_flat = test_targets_ti[:, ds_indexes].flatten(
        ).astype('float32')
        test_preds_ti_flat = test_preds[:, ds_indexes, ti].flatten().astype(
            'float32')

        # take log2
        test_targets_ti_log = np.log2(test_targets_ti_flat + 1)
        test_preds_ti_log = np.log2(test_preds_ti_flat + 1)

        # plot log2
        sns.set(font_scale=1.2, style='ticks')
        out_pdf = '%s/scatter/t%d.pdf' % (options.out_dir, ti)
        plots.regplot(
            test_targets_ti_log,
            test_preds_ti_log,
            out_pdf,
            poly_order=1,
            alpha=0.3,
            sample=500,
            figsize=(6, 6),
            x_label='log2 Experiment',
            y_label='log2 Prediction',
            table=True)

        ############################################
        # violin

        # call peaks: Poisson test of each bin's count against the mean rate,
        # then Benjamini-Hochberg FDR; q < 0.01 counts as a 'Peak'
        test_targets_ti_lambda = np.mean(test_targets_ti_flat)
        test_targets_pvals = 1 - poisson.cdf(
            np.round(test_targets_ti_flat) - 1, mu=test_targets_ti_lambda)
        test_targets_qvals = np.array(ben_hoch(test_targets_pvals))
        test_targets_peaks = test_targets_qvals < 0.01
        test_targets_peaks_str = np.where(test_targets_peaks, 'Peak', 'Background')

        # violin plot
        sns.set(font_scale=1.3, style='ticks')
        plt.figure()
        df = pd.DataFrame({
            'log2 Prediction': np.log2(test_preds_ti_flat + 1),
            'Experimental coverage status': test_targets_peaks_str
        })
        ax = sns.violinplot(
            x='Experimental coverage status', y='log2 Prediction', data=df)
        ax.grid(True, linestyle=':')
        plt.savefig('%s/violin/t%d.pdf' % (options.out_dir, ti))
        plt.close()

        # ROC
        plt.figure()
        fpr, tpr, _ = roc_curve(test_targets_peaks, test_preds_ti_flat)
        auroc = roc_auc_score(test_targets_peaks, test_preds_ti_flat)
        plt.plot(
            [0, 1], [0, 1], c='black', linewidth=1, linestyle='--', alpha=0.7)
        plt.plot(fpr, tpr, c='black')
        ax = plt.gca()
        ax.set_xlabel('False positive rate')
        ax.set_ylabel('True positive rate')
        ax.text(
            0.99, 0.02, 'AUROC %.3f' % auroc,
            horizontalalignment='right')  # , fontsize=14)
        ax.grid(True, linestyle=':')
        plt.savefig('%s/roc/t%d.pdf' % (options.out_dir, ti))
        plt.close()

        # PR
        plt.figure()
        prec, recall, _ = precision_recall_curve(test_targets_peaks, test_preds_ti_flat)
        auprc = average_precision_score(test_targets_peaks, test_preds_ti_flat)
        plt.axhline(
            y=test_targets_peaks.mean(), c='black', linewidth=1,
            linestyle='--', alpha=0.7)
        plt.plot(recall, prec, c='black')
        ax = plt.gca()
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.text(
            0.99, 0.95, 'AUPRC %.3f' % auprc,
            horizontalalignment='right')  # , fontsize=14)
        ax.grid(True, linestyle=':')
        plt.savefig('%s/pr/t%d.pdf' % (options.out_dir, ti))
        plt.close()
def main():
  """Train a SeqNN model on one or more Basenji-style data directories.

  Command line: <params_file> <data1_dir> ...  Copies the params file into
  --out_dir, builds train/eval datasets per data directory, then trains via
  the Trainer (tape loop) or Keras fit.
  """
  usage = 'usage: %prog [options] <params_file> <data1_dir> ...'
  parser = OptionParser(usage)
  parser.add_option('-k', dest='keras_fit', default=False, action='store_true',
      help='Train with Keras fit method [Default: %default]')
  parser.add_option(
      '-o', dest='out_dir',
      default='train_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option(
      '--restore', dest='restore',
      help='Restore model and continue training [Default: %default]')
  parser.add_option('--trunk', dest='trunk', default=False, action='store_true',
      help='Restore only model trunk [Default: %default]')
  parser.add_option(
      '--tfr_train', dest='tfr_train_pattern',
      default=None,
      help=
      'Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
  )
  parser.add_option(
      '--tfr_eval', dest='tfr_eval_pattern',
      default=None,
      help=
      'Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
  )
  (options, args) = parser.parse_args()

  # Need the params file plus at least one data directory.
  if len(args) < 2:
    parser.error('Must provide parameters and data directory.')
  else:
    params_file = args[0]
    data_dirs = args[1:]

  # Keras fit path only supports a single genome/data directory.
  if options.keras_fit and len(data_dirs) > 1:
    print('Cannot use keras fit method with multi-genome training.')
    exit(1)

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)
  # Keep a copy of the parameters beside the training outputs.
  if params_file != '%s/params.json' % options.out_dir:
    shutil.copy(params_file, '%s/params.json' % options.out_dir)

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # read datasets: one train/eval SeqDataset pair per data directory
  train_data = []
  eval_data = []

  for data_dir in data_dirs:
    # load train data
    train_data.append(
        dataset.SeqDataset(data_dir,
            split_label='train',
            batch_size=params_train['batch_size'],
            mode='train',
            tfr_pattern=options.tfr_train_pattern))

    # load eval data
    eval_data.append(
        dataset.SeqDataset(data_dir,
            split_label='valid',
            batch_size=params_train['batch_size'],
            mode='eval',
            tfr_pattern=options.tfr_eval_pattern))

  # NOTE(review): only the single-GPU path is active — the multi-GPU branch
  # below is commented out, so num_gpu != 1 leaves seqnn_model/seqnn_trainer
  # undefined and the training calls raise NameError. Confirm intent.
  if params_train.get('num_gpu', 1) == 1:
    ########################################
    # one GPU

    # initialize model
    # print('INITIALIZE MODEL')
    seqnn_model = seqnn.SeqNN(params_model)
    # seqnn_model = model_zoo.basenji_model((131072,4), 3)

    # restore (optionally just the trunk, keeping a fresh head)
    if options.restore:
      seqnn_model.restore(options.restore, trunk=options.trunk)

    # initialize trainer
    seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data, options.out_dir)

    # compile model
    seqnn_trainer.compile(seqnn_model)

  # else:
  ########################################
  # two GPU

  #   strategy = tf.distribute.MirroredStrategy()
  #
  #   with strategy.scope():
  #
  #     if not options.keras_fit:
  #       # distribute data
  #       for di in range(len(data_dirs)):
  #         train_data[di].distribute(strategy)
  #         eval_data[di].distribute(strategy)
  #
  #     # initialize model
  #     seqnn_model = seqnn.SeqNN(params_model)
  #
  #     # restore
  #     if options.restore:
  #       seqnn_model.restore(options.restore, options.trunk)
  #
  #     # initialize trainer
  #     seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data, options.out_dir,
  #                                     strategy, params_train['num_gpu'], options.keras_fit)
  #
  #     # compile model
  #     seqnn_trainer.compile(seqnn_model)

  # train model: keras fit, single-dataset tape loop, or multi-dataset fit2
  if options.keras_fit:
    seqnn_trainer.fit_keras(seqnn_model)
  else:
    if len(data_dirs) == 1:
      seqnn_trainer.fit_tape(seqnn_model)
    else:
      seqnn_trainer.fit2(seqnn_model)
def main():
  """Train a model (currently only 'basenji') on a Basenji-style dataset.

  Command line: <data_dir> <model_name> <output_dir> <params_file>.
  Compiles the model with a Poisson loss, SGD, and PearsonR/R2 metrics,
  then fits with early stopping and checkpointing into output_dir.
  """
  usage = 'usage: %prog [options] <data_dir> <model_name> <output_dir> <params_file>...'
  parser = OptionParser(usage)
  # BUG FIX: numeric options now declare type=, so command-line overrides are
  # parsed as numbers; optparse's default type is 'string', which previously
  # fed str values into EarlyStopping/SGD when any flag was set explicitly.
  parser.add_option(
      '-b', dest='batch_size',
      default=4, type='int',
      help='Batch size for the model training [Default: %default]')
  parser.add_option('-p', dest='patience', default=8, type='int',
      help='Training patience [Default: %default]')
  parser.add_option('-l', dest='learning_rate', default=0.1, type='float',
      help='Learning rate [Default: %default]')
  parser.add_option('-m', dest='momentum', default=0.99, type='float',
      help='SGD momentum [Default: %default]')
  # BUG FIX: help texts below were copy-pasted as 'Training patience'.
  parser.add_option('-e', dest='n_epochs', default=8, type='int',
      help='Number of training epochs [Default: %default]')
  parser.add_option('--clip_norm', dest='clip_norm', default=1000000, type='float',
      help='Gradient clipping norm [Default: %default]')
  (options, args) = parser.parse_args()

  ########TODO:ADD THE REST OF THE parameters
  if len(args) < 4:
    parser.error('Must provide data_dir, model name, output directory and params file.')
  else:
    data_dir = args[0]
    model_name = args[1]
    output_dir = args[2]
    params_file = args[3]

  if not os.path.isdir(output_dir):
    os.mkdir(output_dir)

  ####LOAD DATA
  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # read datasets
  train_data = []
  eval_data = []

  # load train data
  train_data.append(
      dataset.SeqDataset(data_dir,
          split_label='train',
          batch_size=params_train['batch_size'],
          mode='train'))

  # load eval data
  eval_data.append(
      dataset.SeqDataset(data_dir,
          split_label='valid',
          batch_size=params_train['batch_size'],
          mode='eval'))

  ##########################################
  # Fail fast on an unknown model name; previously the code fell through the
  # 'basenji' branch and crashed later with NameError on loss_fn/model.
  if model_name != 'basenji':
    parser.error('Unknown model name: %s' % model_name)

  model = model_zoo.basenji_model((131072, 4), 3)
  loss_fn = tf.keras.losses.Poisson(reduction=tf.keras.losses.Reduction.NONE)

  early_stop = tf.keras.callbacks.EarlyStopping(
      monitor='val_pearsonr',  #'val_aupr',#
      patience=options.patience,
      verbose=1,
      mode='max')
  save_best = tf.keras.callbacks.ModelCheckpoint(
      '{}/model_best.h5'.format(output_dir),
      save_best_only=True,
      mode='max',
      monitor='val_pearsonr',
      verbose=1)
  callbacks = [
      early_stop,
      tf.keras.callbacks.TensorBoard(output_dir),
      tf.keras.callbacks.ModelCheckpoint('%s/model_check.h5' % output_dir),
      save_best
  ]

  # fit model
  num_targets = model.output_shape[-1]
  print('num_targets ', num_targets)
  model_metrics = [metrics.PearsonR(num_targets), metrics.R2(num_targets)]

  optimizer = tf.keras.optimizers.SGD(learning_rate=options.learning_rate,
                                      momentum=options.momentum,
                                      clipnorm=options.clip_norm)

  model.compile(loss=loss_fn, optimizer=optimizer, metrics=model_metrics)

  # BUG FIX: the original called model.fit(train, ..., validation_data=valid),
  # but 'train'/'valid' were never defined — only the commented-out
  # load_data() call produced them — so this line always raised NameError.
  # Fit on the datasets that were actually loaded above.
  # NOTE(review): assumes SeqDataset exposes its tf.data pipeline as
  # `.dataset` (the convention used by the project's Trainer) — confirm
  # against the dataset module.
  model.fit(train_data[0].dataset,
            epochs=options.n_epochs,
            callbacks=callbacks,
            validation_data=eval_data[0].dataset)