def main(train_file, val_file, kmer, model_dir):
    """Train an auto-sklearn classifier on joint sequence + error features.

    Parameters
    ----------
    train_file : str
        Path to the training data consumed by ``ut.get_data_jm``.
    val_file : str
        Path to the validation data.
    kmer : int
        K-mer size forwarded to the data loader.
    model_dir : str
        Output directory for the model.
        NOTE(review): currently unused — confirm whether the fitted
        classifier should be persisted here.

    Returns
    -------
    Predictions of the fitted classifier on the validation inputs.
    """
    input_train_seq, input_train_err, label = ut.get_data_jm(train_file, kmer)
    input_val_seq, input_val_err, vy = ut.get_data_jm(val_file, kmer)

    cls = autosklearn.classification.AutoSklearnClassifier()
    cls.fit([input_train_seq, input_train_err], label)

    # BUG FIX: interactive `import pdb; pdb.set_trace()` breakpoints were
    # left in; they are removed and the predictions are returned instead
    # of being silently discarded.
    predictions = cls.predict([input_val_seq, input_val_err])
    return predictions
def train_jm(train_file, val_file, log_dir, model_dir, batch_size, kmer,
             epochs, checkpoint_path=None):
    """Train the JointNN model on sequence + error features.

    Parameters
    ----------
    train_file, val_file : str
        Input files consumed by ``ut.get_data_jm``.
    log_dir : str
        Base path for TensorBoard logs; a timestamp suffix is appended.
    model_dir : str
        Base path for model checkpoints; a timestamp suffix is appended.
    batch_size, kmer, epochs : int
        Training hyper-parameters.
    checkpoint_path : str, optional
        If given, weights are restored from this checkpoint before training.
    """
    strategy = tf.distribute.MirroredStrategy()

    # BUG FIX: the timestamp is taken once so the log and model directories
    # cannot get mismatched suffixes when the two now() calls straddled a
    # second boundary.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir += timestamp + "_jm"
    model_dir += timestamp + "_jm_model"

    input_train_seq, input_train_err, label = ut.get_data_jm(train_file, kmer)
    input_val_seq, input_val_err, vy = ut.get_data_jm(val_file, kmer)

    ## train model
    with strategy.scope():
        model = JointNN()
        model.compile(
            loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.00125),
            metrics=['accuracy'])

        # Two parallel inputs: one (kmer, 9) tensor per branch.
        input_shape = [(None, kmer, 9), (None, kmer, 9)]
        model.build(input_shape)
        # BUG FIX: summary() prints itself and returns None, so the former
        # print(model.summary()) emitted a spurious "None" line.
        model.summary()

        if checkpoint_path:
            model.load_weights(checkpoint_path)

        callback_list = [
            tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),
            tf.keras.callbacks.ModelCheckpoint(filepath=model_dir,
                                               monitor='val_accuracy',
                                               mode='max',
                                               save_best_only=True,
                                               save_weights_only=False),
        ]

        model.fit([input_train_seq, input_train_err], label,
                  batch_size=batch_size, epochs=epochs,
                  callbacks=callback_list,
                  validation_data=([input_val_seq, input_val_err], vy))
        model.save(model_dir)

    return None
def joint_read_calling(test_file, kmer, trained_model, model_type):
    """Run per-read modification calling with the joint model.

    Returns
    -------
    tuple
        (predictions, inferred labels, read ids) for every read in
        ``test_file``.
    """
    seq_feats, err_feats, _labels, read_ids = ut.get_data_jm(
        test_file, kmer, get_id=True)
    predictions, inferred = test_single_read(
        [seq_feats, err_feats], trained_model, model_type, kmer)
    return predictions, inferred, read_ids
def main(features, model_type, model, kmer, output, err_features=False):
    """Evaluate a trained model and produce per-feature exploration plots.

    Parameters
    ----------
    features : str
        Path to the feature file for the chosen loader.
    model_type : str
        One of 'seq', 'err', 'joint'.
    model
        Trained model passed to ``acc_test_single``.
    kmer : int
        K-mer size forwarded to the data loaders and plotting helpers.
    output : str
        Directory where the pdf plots are written.
    err_features : bool, optional
        Forwarded to ``ut.get_data_sequence`` for the 'seq' model type.
        BUG FIX: this was previously an undefined name (NameError).
    """
    if model_type == 'seq':
        data_seq, labels = ut.get_data_sequence(features, kmer, err_features)
        acc, pred, inferred = acc_test_single(data_seq, labels, model)
    elif model_type == 'err':
        data_err, labels = ut.get_data_errors(features, kmer)
        acc, pred, inferred = acc_test_single(data_err, labels, model)
    elif model_type == 'joint':
        data_seq, data_err, labels = ut.get_data_jm(features, kmer)
        acc, pred, inferred = acc_test_single([data_seq, data_err],
                                              labels, model)

    # BUG FIX: the exploration plots need BOTH the sequence and the error
    # features, which are only loaded together by the 'joint' model type;
    # previously this section raised NameError for 'seq'/'err'.
    if model_type != 'joint':
        return

    mean, median, std, rang, len_sig = get_ind_feat_seq(
        data_seq, labels, pred, inferred)
    quality, mismatch, deletion, insertion = get_ind_feat_err(
        data_err, labels, pred, inferred)

    to_plot = [
        (mean, 'mean.pdf', (-3, 3)),
        (median, 'median.pdf', (-3, 3)),
        (std, 'std.pdf', (-3, 3)),
        (rang, 'rang.pdf', (-0.5, 3)),
        (len_sig, 'len_sig.pdf', (0, 50)),
        (quality, 'quality.pdf', (0, 20)),
    ]
    # NOTE(review): plot_err is built but never consumed — confirm whether
    # the error-feature plots were meant to be rendered as well.
    plot_err = [(mismatch, 'mismatch.pdf'),
                (deletion, 'deletion.pdf'),
                (insertion, 'insertion.pdf')]

    for feat, fname, limits in to_plot:
        pl.feature_exploration_plots(feat, kmer, output, fname, limits)
        pl.do_PCA(feat, kmer, output, fname)
def process_chunk(features, tmp_folder, output, model):
    """Score the joint model on one chunk of features.

    Rows are restricted to genomic positions 1,000,000-2,000,000;
    empty chunks are skipped after the filter.
    """
    chunk = pd.read_csv(os.path.join(tmp_folder, features),
                        sep='\t', names=names_all)
    chunk = chunk[(chunk['pos'] >= 1000000) & (chunk['pos'] <= 2000000)]

    # Random suffix keeps the temporary h5 file names unique across chunks.
    tag = round(random(), 10)

    if chunk.shape[0] == 0:
        return

    ut.preprocess_combined(chunk, output, 'all_{}'.format(tag), 'test')
    test_file = os.path.join(output, 'test_all_{}.h5'.format(tag))
    data_seq, data_err, labels = ut.get_data_jm(test_file, 17)
    acc, pred, inferred = acc_test_single([data_seq, data_err], labels, model)
    print(acc, Counter(chunk['methyl_label']))
def call_mods_user(model_type, test_file, trained_model, kmer, output,
                   err_features=False, pos_based=False, pred_type='min_max',
                   figures=False):
    """Call modifications on user data, per read and optionally per position.

    Parameters
    ----------
    model_type : str
        One of 'seq', 'err', 'joint'.
    test_file : str
        Input data; a .tsv is preprocessed into an h5 file first.
    trained_model
        Model handed to ``test_single_read``.
    kmer : int
        K-mer size forwarded to the data loaders.
    output : str
        Directory for the per-position output table.
    err_features, pos_based, figures : bool, optional
        Behaviour switches; ``figures`` is currently unused.
    pred_type : str, optional
        Aggregation strategy for ``do_per_position_analysis``.

    Returns
    -------
    dict or None
        Per-threshold precision/recall/F-score tuples when position-based
        calling ran, otherwise None.
    """
    ## process text file input
    if test_file.rsplit('.')[-1] == 'tsv':
        print("processing tsv file, this might take a while...")
        test = pd.read_csv(test_file, sep='\t', names=pr.names_all)
        ut.preprocess_combined(test, os.path.dirname(test_file), '', 'test_all')
        test_file = os.path.join(os.path.dirname(test_file), 'test_all.h5')

    ## read-based calling
    # NOTE(review): test_single_read is invoked with different arities per
    # branch (2 args for seq/err, 3 for joint, passing `labels` where other
    # call sites pass a model type) — confirm its signature.
    if model_type == 'seq':
        data_seq, labels = ut.get_data_sequence(test_file, kmer, err_features)
        pred, inferred = test_single_read(data_seq, trained_model)
    elif model_type == 'err':
        data_err, labels = ut.get_data_errors(test_file, kmer)
        pred, inferred = test_single_read(data_err, trained_model)
    elif model_type == 'joint':
        data_seq, data_err, labels, data_id = ut.get_data_jm(
            test_file, kmer, get_id=True)
        pred, inferred = test_single_read(
            [data_seq, data_err], trained_model, labels)

    # ut.save_probs_user(pred, inferred, output)

    ## position-based calling
    # TODO store position info in test file
    # Only the joint loader returns read ids, which the per-position
    # analysis needs; previously `test` could be referenced unbound for the
    # seq/err model types.
    if pos_based and 'data_id' in locals():
        test = build_test_df(data_id)
        test['methyl_label'] = labels
        # TODO output a proper df with all the information: put columns at
        # different thresholds as well as the min/max for testing.
        all_preds = do_per_position_analysis(test, pred, inferred, output,
                                             pred_type)
        all_preds.to_csv(os.path.join(output, 'human_chr1_table.tsv'),
                         sep='\t', index=None)

        # BUG FIX: the eight precision/recall computations were assigned to
        # throwaway names (and 'pred_posterior' was silently overwritten by
        # 'pred_beta'); they are now collected per threshold and returned.
        # The leftover pdb breakpoint is removed.
        metrics = {}
        for col in ('pred_005', 'pred_01', 'pred_02', 'pred_03', 'pred_04',
                    'pred_min_max', 'pred_posterior', 'pred_beta'):
            metrics[col] = precision_recall_fscore_support(
                all_preds['meth_label'], all_preds[col], average='binary')
        return metrics