def __init__(self, weight_path, win_size, n_feat, n_class, drop, _blocks, fc_size):
    import tensorflow as tf
    from keras import backend as K
    from model import cnv_net

    K.clear_session()
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    model_name = 'cnvnet'
    self.model = cnv_net(win_size, n_feat, n_class, filters=32, kernel_size=16,
                         strides=1, pool_size=2, pool_stride=2, drop=drop,
                         blocks=_blocks, fc_size=fc_size, kernel_regular_l2=None,
                         m_name=model_name)
    self.model.load_weights(weight_path)
def __init__(self, weight_path, win_size, n_feat, n_class, drop, _blocks, fc_size,
             l2ratio, temp, filters, kernel_size):
    import tensorflow as tf
    from keras import backend as K
    from model import cnv_net

    K.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.2  # 0.3 (2-core) -> 0.15 (4-core)
    sess = tf.Session(config=config)
    K.set_session(sess)
    model_name = 'cnvnet'
    self.model = cnv_net(win_size, n_feat, n_class, filters=filters, kernel_size=kernel_size,
                         strides=1, pool_size=2, pool_stride=2, drop=drop,
                         blocks=_blocks, fc_size=fc_size, kernel_regular_l2=l2ratio,
                         temperature=temp, m_name=model_name)
    self.model.load_weights(weight_path)
def __init__(self, weight_path, win_size, n_feat, n_class, drop, _blocks, fc_size, n_proc=10):
    import tensorflow as tf
    from keras import backend as K
    from model import cnv_net

    K.clear_session()
    # CPU-only session: cap the process at n_proc CPU devices, one thread per op
    config = tf.ConfigProto(device_count={"CPU": n_proc},
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    sess = tf.Session(config=config)
    K.set_session(sess)
    model_name = 'cnvnet'
    self.model = cnv_net(win_size, n_feat, n_class, filters=32, kernel_size=16,
                         strides=1, pool_size=2, pool_stride=2, drop=drop,
                         blocks=_blocks, fc_size=fc_size, kernel_regular_l2=None,
                         m_name=model_name)
    self.model.load_weights(weight_path)
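# Hedged usage sketch (not from the original source): the three __init__ variants
# above appear to belong to predictor wrapper classes that differ only in session
# configuration (default GPU, capped GPU memory, CPU-only). Assuming such a class
# named e.g. `CnvPredictor` (hypothetical name), it would be driven like this:
#
#   predictor = CnvPredictor('model_weight/b1024_e64_lr0.001_dr0.5_fc64_blk443_win1000-cnvnet.hdf5',
#                            win_size=1000, n_feat=13, n_class=3,
#                            drop=0.5, _blocks=(4, 4, 3), fc_size=64)
#   y_prob = predictor.model.predict(x_batch)  # x_batch: (n, win_size, n_feat)
#   y_label = y_prob.argmax(axis=-1)           # 0=neu, 1=del, 2=dup per the encoding used below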
def __load_model(self, model_weight_fn):
    logger.info('loading model weight...')
    assert os.path.exists(model_weight_fn)
    model_weight_name = os.path.splitext(os.path.basename(model_weight_fn))[0]
    model_in_lst = model_weight_name.split('-')
    model_name = model_in_lst[1]
    model_params_lst = re.findall(r"[-+]?\d*\.\d+|\d+", model_in_lst[0])
    logger.info('model name: {0}, model params(batch, epoch, lr, drop, fc, block, win): {1}'.format(
        model_name, model_params_lst))
    # indices 0..6 are read below, so at least 7 parameters are required
    assert len(model_params_lst) >= 7
    batch_size = int(model_params_lst[0])
    drop = float(model_params_lst[3])
    fc_size = int(model_params_lst[4])
    # each digit of the block token encodes one block size, e.g. '443' -> (4, 4, 3);
    # use a tuple rather than a generator so it can be consumed more than once
    blocks = tuple(int(x) for x in model_params_lst[5])
    win_size = int(model_params_lst[6])
    model = None
    if model_name == 'cnvnet':
        model = cnv_net(win_size, self.n_features, self.n_classes,
                        drop=drop, blocks=blocks, fc_size=fc_size)
    model.load_weights(model_weight_fn)
    return model, batch_size
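def _demo_parse_weight_name():
    # Worked example (added for illustration) of the weight-file naming convention
    # that __load_model parses. A name built by out_name() below splits on '-' into
    # a parameter token and the model name; the regex then pulls out the numbers.
    import re
    name = 'b1024_e64_lr0.001_dr0.5_fc64_blk443_win1000-cnvnet'
    params = re.findall(r"[-+]?\d*\.\d+|\d+", name.split('-')[0])
    # batch=1024, epoch=64, lr=0.001, drop=0.5, fc=64, blocks='443', win=1000
    assert params == ['1024', '64', '0.001', '0.5', '64', '443', '1000']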
def online_call(online_seg_data_root_dir, online_call_out_root_dir, model_in_root_dir,
                n_win_size=1000, n_feat=13, n_class=3, epochs=64, batch=1024,
                learn_rate=0.001, drop=0.5, fc_size=64, blocks='4_4_3',
                step_size=200, sample_id='NA12878', chr_id='1', min_ratio=0.1,
                seg_range='a', cost_mat='221', n_proc=10):
    # get model name
    _blocks = tuple(int(x) for x in blocks.split('_'))

    def out_name():
        str_blocks = [str(x) for x in blocks.split('_')]
        str_blk = ''.join(str_blocks)
        return 'b{0}_e{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_blk{5}_win{6}_cw{7}'.format(
            batch, epochs, learn_rate, drop, fc_size, str_blk, n_win_size, cost_mat)

    # set environment
    time.sleep(10)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    logger.info('waiting for an available gpu device...')
    while True:
        gpu_id_lst = GPU.getFirstAvailable(order='random', maxMemory=0.001, maxLoad=0.001,
                                           attempts=50, interval=60)
        if len(gpu_id_lst) > 0:
            break
    logger.info('processing on device id {}...'.format(gpu_id_lst[0]))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id_lst[0])

    logger.info('loading model...')
    K.clear_session()
    config = tf.ConfigProto(device_count={"CPU": n_proc},
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    model_name = 'cnvnet'
    model = cnv_net(n_win_size, n_feat, n_class, filters=32, kernel_size=16, strides=1,
                    pool_size=2, pool_stride=2, drop=drop, blocks=_blocks,
                    fc_size=fc_size, kernel_regular_l2=None, m_name=model_name)
    model_weight_fn = os.path.join(model_in_root_dir, out_name() + '-' + model_name + '.hdf5')
    if not os.path.exists(model_weight_fn):
        raise FileNotFoundError('model weight file not found. {}'.format(model_weight_fn))
    model.load_weights(model_weight_fn)

    # load data
    # default: online_seg_data_root_dir = '/zfssz2/ST_MCHRI/BIGDATA/PROJECT/NIPT_CNV/f_cnv_out/online'
    online_seg_data_subroot_dir = os.path.join(online_seg_data_root_dir, sample_id + '/data')
    if not os.path.isdir(online_seg_data_subroot_dir):
        raise FileNotFoundError('No segments generated for sample {}, chr {}'.format(
            sample_id, chr_id))
    part_fname = 'win{0}_step{1}_r{2:.2f}_chr{3}_seg_'.format(
        n_win_size, step_size, min_ratio, chr_id)

    # create out dir
    # default: online_call_out_root_dir = '/zfssz2/ST_MCHRI/BIGDATA/PROJECT/NIPT_CNV/f_cnv_out/online'
    online_call_out_subroot_dir = os.path.join(online_call_out_root_dir, sample_id + '/cnv_call')
    if not os.path.isdir(online_call_out_subroot_dir):
        os.mkdir(online_call_out_subroot_dir)
    call_out_fn = os.path.join(
        online_call_out_subroot_dir,
        'win{0}_step{1}_r{2:.2f}_chr{3}-cnv-call-'.format(
            n_win_size, step_size, min_ratio, chr_id))
    if seg_range != 'a':  # if not the whole chromosome
        part_fname = part_fname + 'p-' + seg_range + '_'
        call_out_fn = call_out_fn + 'p-' + seg_range + '-'
    else:
        part_fname = part_fname + 'a_'
        call_out_fn = call_out_fn + 'a-'
    call_out_fn = call_out_fn + 'result.csv'
    if os.path.exists(call_out_fn):
        os.remove(call_out_fn)

    # gap regions are reported without predictions; pad the 4 prediction
    # columns (p_neu, p_del, p_dup, pred_l) with -1
    gap_h5_fn = os.path.join(online_seg_data_subroot_dir, part_fname + 'gap.h5')
    gap_pred = None
    if os.path.exists(gap_h5_fn):
        with h5py.File(gap_h5_fn, 'r') as gap_fn_read:
            gap_obj = gap_fn_read.get('gap')
            if gap_obj is not None:
                gap_result = gap_obj.value
                tmp_arr = np.full((gap_result.shape[0], 4), -1)  # tmp_arr[:] = np.nan
                gap_pred = np.concatenate((gap_result, tmp_arr), axis=1)
                del tmp_arr

    unpred_h5_fn_list = glob.glob(
        os.path.join(online_seg_data_subroot_dir, part_fname + 'unpred_*'))
    f_unpred_arr = None
    for i_unpred_fn in unpred_h5_fn_list:
        with h5py.File(i_unpred_fn, 'r') as i_unpred_fn_read:
            i_unpred_meta = i_unpred_fn_read.get('unpred_meta').value
        i_unpred_meta_arr = np.array(i_unpred_meta)
        tmp_arr = np.full((i_unpred_meta_arr.shape[0], 4), -1)  # tmp_arr[:] = np.nan
        if f_unpred_arr is None:
            f_unpred_arr = np.concatenate((i_unpred_meta_arr, tmp_arr), axis=1)
        else:
            unpred_arr = np.concatenate((i_unpred_meta_arr, tmp_arr), axis=1)
            f_unpred_arr = np.concatenate((f_unpred_arr, unpred_arr), axis=0)

    pred_h5_fn_list = glob.glob(
        os.path.join(online_seg_data_subroot_dir, part_fname + 'pred_*'))
    logger.info('calling cnv...')
    f_pred_res = None
    for i, i_fn in enumerate(pred_h5_fn_list):
        logger.info('processing {}/{}:{}'.format(i + 1, len(pred_h5_fn_list), i_fn))
        with h5py.File(i_fn, 'r') as i_fn_read:
            i_fn_pred_meta = i_fn_read.get('pred_meta').value
            i_fn_pred_feat = i_fn_read.get('pred_feat').value
        i_feat_arr = np.array(i_fn_pred_feat)
        del i_fn_pred_feat
        ypred = model.predict_on_batch(i_feat_arr)
        ypred_l = np.argmax(ypred, axis=1)
        assert len(i_fn_pred_meta) == ypred.shape[0]
        i_pred_res = np.concatenate(
            (np.array(i_fn_pred_meta), ypred, ypred_l[:, np.newaxis]), axis=1)
        if f_pred_res is None:
            f_pred_res = i_pred_res
        else:
            f_pred_res = np.concatenate((f_pred_res, i_pred_res), axis=0)

    logger.info('combining and sorting results...')
    # stack whichever partial results exist (gap, unpredictable, predicted)
    parts = [arr for arr in (gap_pred, f_unpred_arr, f_pred_res) if arr is not None]
    if not parts:
        raise ValueError('no gap/unpred/pred results found for sample {}, chr {}'.format(
            sample_id, chr_id))
    whl_cnv_re = np.concatenate(parts, axis=0) if len(parts) > 1 else parts[0].copy()
    del gap_pred, f_unpred_arr, f_pred_res

    # sort by segment start position
    ind = np.argsort(whl_cnv_re[:, 0])
    whl_cnv_re = whl_cnv_re[ind]
    out_df = pd.DataFrame(data=whl_cnv_re,
                          columns=['seg_s', 'seg_e', 'seg_l', 'indicator',
                                   'p_neu', 'p_del', 'p_dup', 'pred_l'])
    out_df[['seg_s', 'seg_e', 'seg_l', 'indicator', 'pred_l']] = \
        out_df[['seg_s', 'seg_e', 'seg_l', 'indicator', 'pred_l']].astype(int)
    out_df.to_csv(call_out_fn, index=False, sep='\t')
    logger.info('Done, online cnv call results saved into {}'.format(call_out_fn))
def train(training_generator, validation_data, model_root_dir, epochs=50, batch=256,
          learn_rate=0.001, drop=0.5, fc_size=128, blocks='4_4_3', n_gpu=4, n_cpu=10):
    """Train the final model from batch generators (epochs raised from 50 to 60).

    :param training_generator: generator/Sequence yielding (x, y) training batches
    :param validation_data: validation generator/Sequence or (x_val, y_val) tuple
    :param model_root_dir: root directory for model weights and logs
    :param epochs: number of training epochs
    :param batch: batch size
    :param learn_rate: initial learning rate
    :param drop: dropout ratio
    :param fc_size: fully-connected layer size
    :param blocks: residual block spec, e.g. '4_4_3'
    :param n_gpu: number of GPUs
    :param n_cpu: number of CPU workers for the generator
    :return: None
    """
    n_win_size = 1000
    n_class = 3
    n_feat = 13
    gc.collect()

    def out_name():
        str_blocks = [str(x) for x in blocks.split('_')]
        str_blk = ''.join(str_blocks)
        return 'b{0}_e{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_blk{5}_win{6}'.format(
            batch, epochs, learn_rate, drop, fc_size, str_blk, n_win_size)

    _blocks = tuple(int(x) for x in blocks.split('_'))
    K.clear_session()
    config = tf.ConfigProto(device_count={'GPU': n_gpu, 'CPU': n_cpu})
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    model_name = 'cnvnet'
    base_model = cnv_net(n_win_size, n_feat, n_class, filters=32, kernel_size=16,
                         strides=1, pool_size=2, pool_stride=2, drop=drop,
                         blocks=_blocks, fc_size=fc_size, kernel_regular_l2=None,
                         m_name=model_name)
    # base_model = cnv_net_seq(n_win_size, n_feat, n_class)
    if n_gpu > 1:
        model = multi_gpu_model(base_model, n_gpu)
    else:
        model = base_model

    _model_dir = os.path.join(model_root_dir, 'final_model/model_weight')
    if not os.path.isdir(_model_dir):
        os.makedirs(_model_dir)
    _tb_dir = os.path.join(model_root_dir, 'final_model/tb_logs')
    if not os.path.isdir(_tb_dir):
        os.makedirs(_tb_dir)
    _csvlogger_dir = os.path.join(model_root_dir, 'final_model/model_csvlogger')
    if not os.path.isdir(_csvlogger_dir):
        os.makedirs(_csvlogger_dir)
    model_fn = os.path.join(_model_dir, '{}-{}.hdf5'.format(out_name(), model_name))
    tensorboard_fn = os.path.join(_tb_dir, '{}-{}'.format(out_name(), model_name))
    csvlogger_fn = os.path.join(_csvlogger_dir, '{}-{}'.format(out_name(), model_name))
    callbacks = [
        # early stopping on validation accuracy
        EarlyStopping(monitor='val_acc', patience=5, verbose=1),
        # halve the learning rate when val_acc plateaus
        AdvancedLearnignRateScheduler(monitor='val_acc', patience=1, verbose=1,
                                      mode='auto', decayRatio=0.5),
        # save the best base-model weights (safe with the multi-GPU wrapper)
        MultiGPUCheckpointCallback(model_fn, base_model=base_model, monitor='val_acc',
                                   save_best_only=True, verbose=1, save_weights_only=True),
        TensorBoard(tensorboard_fn, batch_size=batch, histogram_freq=2),
        CSVLogger(csvlogger_fn)
    ]
    model.compile(optimizer=keras.optimizers.Adam(lr=learn_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit_generator(generator=training_generator,
                        validation_data=validation_data,
                        epochs=epochs,
                        callbacks=callbacks,
                        use_multiprocessing=True,
                        workers=n_cpu)
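# A minimal sketch (an assumption: the repo's actual CNVDataGenerator may differ)
# of the keras.utils.Sequence interface that train() expects for
# training_generator/validation_data. It serves (batch, win_size, n_feat) windows
# with one-hot labels from in-memory arrays; SimpleWindowGenerator is a
# hypothetical helper, not part of the original repo.
import numpy as np
from keras.utils import Sequence


class SimpleWindowGenerator(Sequence):
    def __init__(self, x, y, batch_size):
        self.x, self.y, self.batch_size = x, y, batch_size

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        # slice out one batch; Keras workers call this concurrently
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[sl], self.y[sl]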
def evaluate(model_root_dir, total_samples_ls_fn, test_ratio, win_size, min_size,
             min_r, min_f_deldup, min_f_neu, is_norm, model_weight_fn):
    # load the test set from cache if present, otherwise build and cache it
    test_set_fn = os.path.join(
        model_root_dir,
        'test{:.2f}_win{}_minsize{}_dataset.npz'.format(test_ratio, win_size, min_size))
    if os.path.exists(test_set_fn):
        with np.load(test_set_fn) as test_set:
            xtest = test_set['x_test']
            ytest = test_set['y_test']
    else:
        from keras.utils import to_categorical
        _, test_ids = get_train_test_ids(model_root_dir, total_samples_ls_fn, test_ratio)
        x_test = []
        y_test_ = []
        for ix, test_id in enumerate(test_ids):
            x, y = get_data(test_id, model_root_dir, win_size, min_r,
                            min_f_deldup, min_f_neu, is_norm)
            if x is not None and y is not None:
                if len(x_test) == 0:
                    x_test = x
                    y_test_ = y
                else:
                    x_test = np.concatenate((x_test, x), axis=0)
                    y_test_ = np.concatenate((y_test_, y))
        # encode labels: NEU -> 0, DEL -> 1, DUP -> 2
        y_ = [1 if x == 'DEL' else 2 if x == 'DUP' else 0 for x in y_test_]
        y_test = to_categorical(y_)
        logger.info('x test: {}, y test: {}'.format(x_test.shape, y_test.shape))
        np.savez_compressed(test_set_fn, x_test=x_test, y_test=y_test)
        xtest = x_test
        ytest = y_test

    n_example, n_win_size, n_feat = xtest.shape
    n_class = ytest.shape[1]

    model_weight_name = os.path.splitext(os.path.basename(model_weight_fn))[0]
    model_in_lst = model_weight_name.split('-')
    model_name = model_in_lst[1]
    model_params_lst = re.findall(r"[-+]?\d*\.\d+|\d+", model_in_lst[0])
    logging.info('model name: {0}, model params(batch, epoch, lr, drop, fc, block, win): {1}'.format(
        model_name, model_params_lst))
    assert len(model_params_lst) >= 6
    drop = float(model_params_lst[3])
    fc_size = int(model_params_lst[4])
    # tuple rather than generator so the blocks spec can be reused
    blocks = tuple(int(x) for x in model_params_lst[5])
    model = None
    if model_name == 'cnvnet':
        model = cnv_net(n_win_size, n_feat, n_class, drop=drop, blocks=blocks, fc_size=fc_size)
    model.load_weights(model_weight_fn)
    logging.info("finished loading model!")

    ypred = model.predict(xtest)
    test_pred_fn = os.path.join(
        model_root_dir,
        'test_out/test{:.2f}_win{}_minsize{}_pred.npz'.format(test_ratio, win_size, min_size))
    np.savez_compressed(test_pred_fn, ypred=ypred, ytrue=ytest)
    logger.info('Predict Done, result saved at {}'.format(test_pred_fn))
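# Usage sketch for consuming the cached predictions written above (np.load on the
# .npz archive mirrors the read of the cached test set at the top of evaluate):
#
#   with np.load(test_pred_fn) as f:
#       ypred, ytrue = f['ypred'], f['ytrue']   # softmax probs and one-hot labels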
def evaluate(test_sample_fn, test_out_root_dir, model_in_root_dir, n_win_size=1000,
             n_feat=13, n_class=3, epochs=64, batch=1024, learn_rate=0.001, drop=0.5,
             fc_size=64, blocks='4_4_3', n_cpu=20, class_weights=None):
    # get model name
    _blocks = tuple(int(x) for x in blocks.split('_'))

    def out_name():
        str_blocks = [str(x) for x in blocks.split('_')]
        str_blk = ''.join(str_blocks)
        if class_weights is not None:
            class_weight_label = ''.join(
                np.array([class_weights[1], class_weights[2], class_weights[0]]).astype(str))
        else:
            class_weight_label = '111'
        return 'b{0}_e{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_blk{5}_win{6}_cw{7}'.format(
            batch, epochs, learn_rate, drop, fc_size, str_blk, n_win_size, class_weight_label)

    logger.info('loading model...')
    K.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    model_name = 'cnvnet'
    model = cnv_net(n_win_size, n_feat, n_class, filters=32, kernel_size=16, strides=1,
                    pool_size=2, pool_stride=2, drop=drop, blocks=_blocks, fc_size=fc_size,
                    kernel_regular_l2=None, m_name=model_name)
    model_weight_fn = os.path.join(model_in_root_dir, out_name() + '-' + model_name + '.hdf5')
    if not os.path.exists(model_weight_fn):
        raise FileNotFoundError('model weight file not found. {}'.format(model_weight_fn))
    model.load_weights(model_weight_fn)

    test_out_dir = os.path.join(test_out_root_dir, 'test_out')
    if not os.path.isdir(test_out_dir):
        os.mkdir(test_out_dir)
    test_out_fn = os.path.join(test_out_dir, out_name() + '-offline-test.h5')

    test_sample_df = pd.read_csv(test_sample_fn, sep='\t')  # slow when using a generator
    test_samples_fn_arr = test_sample_df[['f_name']].values
    test_samples_true_arr = test_sample_df['cnv_type_encode'].values
    test_batch_generator = CNVDataGenerator(test_samples_fn_arr, batch, win_size=n_win_size,
                                            n_feat=n_feat, n_classes=n_class,
                                            shuffle=False, pred_gen=True)
    # predict_generator yields exactly batch_size * n_steps outputs, so the tail
    # of the test set is dropped unless its size is a multiple of the batch size
    ypred = model.predict_generator(test_batch_generator, verbose=1,
                                    use_multiprocessing=True, workers=n_cpu,
                                    max_queue_size=64)
    logger.info('writing predicted results into file...')
    with h5py.File(test_out_fn, 'w') as test_out:
        test_out.create_dataset('ypred', data=ypred)
        test_out.create_dataset('ytrue', data=test_samples_true_arr)
    logger.info('Predict Done, result saved at {}'.format(test_out_fn))
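# Usage sketch for reading the offline test output written above (standard h5py
# reads against the two datasets this function creates):
#
#   import h5py
#   with h5py.File(test_out_fn, 'r') as f:
#       ypred = f['ypred'][()]   # (n_samples, n_class) softmax probabilities
#       ytrue = f['ytrue'][()]   # integer-encoded true labels (cnv_type_encode)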
def train3(target_train_h5_fn, target_val_h5_fn, model_out_root_dir, n_win_size=5000,
           n_feat=13, n_class=3, filters=32, kernel_size=16, epochs=64, batch=1024,
           learn_rate=0.001, drop=0.5, fc_size=64, blocks='4_4_3', l2r=1e-4,
           temperature=5, lbl_smt_frac=0.1, pw=0, n_gpu=4, n_cpu=20):
    _blocks = tuple(int(x) for x in blocks.split('_'))

    def out_name():
        str_blocks = [str(x) for x in blocks.split('_')]
        str_blk = ''.join(str_blocks)
        return 'b{0}_ep{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_' \
               'blk{5}_win{6}_l2r{7}_temp{8}_lblsmt{9}_pw{10}'.format(
                   batch, epochs, learn_rate, drop, fc_size, str_blk, n_win_size,
                   str(l2r), temperature,
                   int(lbl_smt_frac) if lbl_smt_frac == 0 else lbl_smt_frac, pw)

    K.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)
    model_name = 'cnvnet'
    base_model = cnv_net(n_win_size, n_feat, n_class, filters=filters, kernel_size=kernel_size,
                         strides=1, pool_size=2, pool_stride=2, drop=drop, blocks=_blocks,
                         fc_size=fc_size, kernel_regular_l2=l2r, temperature=temperature,
                         m_name=model_name)
    # base_model = cnv_net_seq(n_win_size, n_feat, n_class)
    if n_gpu > 1:
        model = multi_gpu_model(base_model, n_gpu)
    else:
        model = base_model

    _model_dir = os.path.join(model_out_root_dir, 'model_weight')
    if not os.path.isdir(_model_dir):
        os.mkdir(_model_dir)
    _tb_dir = os.path.join(model_out_root_dir, 'tb_logs')
    if not os.path.isdir(_tb_dir):
        os.mkdir(_tb_dir)
    _csvlogger_dir = os.path.join(model_out_root_dir, 'model_csvlogger')
    if not os.path.isdir(_csvlogger_dir):
        os.mkdir(_csvlogger_dir)
    model_fn = os.path.join(_model_dir, '{}-{}.hdf5'.format(out_name(), model_name))
    if os.path.exists(model_fn):
        os.remove(model_fn)
    tensorboard_fn = os.path.join(_tb_dir, '{}-{}'.format(out_name(), model_name))
    if os.path.isdir(tensorboard_fn):
        shutil.rmtree(tensorboard_fn, ignore_errors=True)
    csvlogger_fn = os.path.join(_csvlogger_dir, '{}-{}'.format(out_name(), model_name))
    if os.path.exists(csvlogger_fn):
        os.remove(csvlogger_fn)

    with h5py.File(target_train_h5_fn, 'r') as train_h5:
        train_len = train_h5['x'].shape[0]
    with h5py.File(target_val_h5_fn, 'r') as val_h5:
        val_len = val_h5['x'].shape[0]

    callbacks = [
        # early stopping on validation F-measure
        EarlyStopping(monitor='val_fmeasure', mode='max', patience=10, verbose=1),
        # halve the learning rate when val_fmeasure plateaus
        AdvancedLearnignRateScheduler(monitor='val_fmeasure', patience=1, verbose=1,
                                      mode='max', decayRatio=0.5),
        # CyclicLR(mode='triangular', base_lr=learn_rate, max_lr=0.1,
        #          step_size=6 * (train_len // batch)),
        # save the best base-model weights
        MultiGPUCheckpointCallback(model_fn, base_model=base_model, monitor='val_fmeasure',
                                   mode='max', save_best_only=True, verbose=1,
                                   save_weights_only=True),
        # TensorBoard histograms require in-memory validation data, which a generator
        # cannot provide ("If printing histograms, validation_data must be provided,
        # and cannot be a generator"), hence histogram_freq=0 if re-enabled:
        # TensorBoard(tensorboard_fn, batch_size=batch, histogram_freq=0),
        CSVLogger(csvlogger_fn)
    ]

    # fine-tune the cost function so that misclassification is weighted
    p_misclass_weight = np.ones((n_class, n_class))  # could be improved further
    p_misclass_weight[:, 0] = 2.0   # penalize FN
    p_misclass_weight[0, :] = 2.0   # penalize FP
    p_misclass_weight[1, 2] = 2.0
    p_misclass_weight[2, 1] = 2.0
    p_misclass_weight[0, 0] = 1.0

    p_misclass_weight2 = np.ones((n_class, n_class))
    p_misclass_weight2[:, 0] = 2.0  # penalize FN
    p_misclass_weight2[0, :] = 1.5  # penalize FP
    p_misclass_weight2[0, 0] = 1.0
    misclass_dict = {1: p_misclass_weight, 2: p_misclass_weight2}

    # pw selects the loss function; the weighted loss follows
    # https://github.com/keras-team/keras/issues/2115
    if pw > 0:
        custom_loss = partial(w_categorical_crossentropy, weights=misclass_dict[pw])
        custom_loss.__name__ = 'w_categorical_crossentropy'
    elif pw == 0:
        custom_loss = 'categorical_crossentropy'
    elif pw == -1:
        custom_loss = focal_loss(alpha=1.0)
    elif pw == -2:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.2, beta_t=0.8, gamma_f=2., alpha_f=1.)
    elif pw == -3:
        custom_loss = combine_weighted_ce_focal_loss(weights=misclass_dict[1], gamma_f=2., alpha_f=1.)
    elif pw == -4:
        custom_loss = combine_tversky_focal_loss(alpha_t=1, beta_t=1, gamma_f=2., alpha_f=1.)
    elif pw == -5:
        custom_loss = tversky_loss(alpha=0.5, beta=0.5)
    elif pw == -6:
        custom_loss = tversky_loss(alpha=0.8, beta=0.2)
    elif pw == -7:
        custom_loss = tversky_loss(alpha=0.1, beta=0.9)
    elif pw == -8:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8, beta_t=0.2, gamma_f=2., alpha_f=1.)
        custom_loss.__name__ = 'tversky_focal_loss'
    elif pw == -9:
        custom_loss = combine_tversky_focal_loss(alpha_t=4.0, beta_t=2.0, gamma_f=2., alpha_f=1.)
    elif pw == -10:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8, beta_t=0.2, gamma_f=4., alpha_f=1.)
    elif pw == -11:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8, beta_t=0.2, gamma_f=1.5, alpha_f=1.)
    elif pw == -12:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.7, beta_t=0.3, gamma_f=2, alpha_f=1.)
    elif pw == -13:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8, beta_t=0.2, gamma_f=2.5, alpha_f=1.)
    elif pw == -14:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8, beta_t=0.2, gamma_f=3, alpha_f=1.)
    elif pw == -15:  # call
        custom_loss = combine_tversky_focal_loss(alpha_t=0.9, beta_t=0.1, gamma_f=2.0, alpha_f=1.)
    elif pw == -16:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.6, beta_t=0.4, gamma_f=2.0, alpha_f=1.)
    elif pw == -17:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.7, beta_t=0.3, gamma_f=1.5, alpha_f=1.)
    elif pw == -18:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.6, beta_t=0.4, gamma_f=1.5, alpha_f=1.)

    model.compile(optimizer=keras.optimizers.Adam(lr=learn_rate),
                  loss=custom_loss,
                  metrics=['accuracy', precision, recall, fmeasure])
    training_batch_generator = CNVDataGenerator3(target_train_h5_fn, train_len, batch,
                                                 win_size=n_win_size, n_feat=n_feat,
                                                 smooth_factor=lbl_smt_frac)
    val_batch_generator = CNVDataGenerator3(target_val_h5_fn, val_len, batch,
                                            win_size=n_win_size, n_feat=n_feat,
                                            smooth_factor=lbl_smt_frac)
    model.fit_generator(generator=training_batch_generator,
                        steps_per_epoch=(train_len // batch),
                        epochs=epochs,
                        verbose=1,
                        # class_weight=class_weights,
                        callbacks=callbacks,
                        validation_data=val_batch_generator,
                        validation_steps=(val_len // batch),
                        use_multiprocessing=True,
                        workers=n_cpu,
                        max_queue_size=(n_cpu + 10))
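# Sketch of the weighted categorical crossentropy that the `pw > 0` branch binds
# via functools.partial. The repo's actual w_categorical_crossentropy is imported
# elsewhere; this version follows the recipe in keras-team/keras#2115 (cited in
# the comment above), scaling each sample's loss by weights[true_class, argmax_class].
from itertools import product
from keras import backend as K


def w_categorical_crossentropy_sketch(y_true, y_pred, weights):
    nb_cl = len(weights)
    final_mask = K.zeros_like(y_pred[:, 0])
    # one-hot mask of the predicted class per sample
    y_pred_max = K.reshape(K.max(y_pred, axis=1), (K.shape(y_pred)[0], 1))
    y_pred_max_mat = K.cast(K.equal(y_pred, y_pred_max), K.floatx())
    # accumulate weights[true, pred] for every (pred, true) class pair
    for c_p, c_t in product(range(nb_cl), range(nb_cl)):
        final_mask += weights[c_t, c_p] * y_pred_max_mat[:, c_p] * y_true[:, c_t]
    return K.categorical_crossentropy(y_true, y_pred) * final_mask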
def gp_fitness(batch_size, epoch, learn_rate, drop, fc_size, blocks):
    batch_size = int(batch_size)
    epoch = int(epoch)
    learn_rate = float(learn_rate)
    drop = float(drop)
    fc_size = int(fc_size)
    print('batch_size: {}'.format(batch_size))
    print('epoch: {}'.format(epoch))
    print('learn rate: {0:.3f}'.format(learn_rate))
    print('drop ratio: {0:.1f}'.format(drop))
    print('fc size: {}'.format(fc_size))
    print('blocks: {}'.format(blocks))
    # tuple, not a generator: it would otherwise be exhausted after one use
    _blocks = tuple(int(x) for x in blocks.split('_'))
    tmp_out_name = out_name(batch_size, epoch, learn_rate, drop, fc_size, blocks)
    val_acc_arr = []
    # for i, (train_idx, val_idx) in enumerate(skf.split(x_train, y_train_labels)):
    #     ix_train1, ix_val1 = x_train[train_idx], x_train[val_idx]
    #     iy_train1, iy_val1 = y_train[train_idx], y_train[val_idx]
    for i in range(1):
        ix_train1, ix_val1, iy_train1, iy_val1 = train_test_split(
            x_train, y_train, test_size=0.2, random_state=123)
        # trim the training split to a whole number of batches and move the
        # remainder into the validation split
        nb_trains = ix_train1.shape[0] // batch_size
        nb_examples = batch_size * nb_trains
        k_x_train = ix_train1[:nb_examples]
        k_y_train = iy_train1[:nb_examples]
        k_x_val = np.concatenate((ix_val1, ix_train1[nb_examples:]), axis=0)
        k_y_val = np.concatenate((iy_val1, iy_train1[nb_examples:]), axis=0)
        del ix_train1, ix_val1, iy_train1, iy_val1
        # gc.collect()
        model_fn = os.path.join(_model_dir, '{0}-k{1}.hdf5'.format(tmp_out_name, i))
        tensorboard_fn = os.path.join(_tb_dir, '{0}-tb_k{1}'.format(tmp_out_name, i))
        csvlogger_fn = os.path.join(_csvlogger_dir, '{0}-csvlogger_k{1}'.format(tmp_out_name, i))
        base_model = cnv_net(n_win_size, n_feat, n_class, filters=32, kernel_size=16,
                             strides=1, pool_size=2, pool_stride=2, drop=drop,
                             blocks=_blocks, fc_size=fc_size, kernel_regular_l2=None,
                             m_name=tmp_out_name)
        if n_gpu > 1:
            model = multi_gpu_model(base_model, n_gpu)
        else:
            model = base_model
        callbacks = [
            # early stopping on validation accuracy
            EarlyStopping(monitor='val_acc', patience=5, verbose=1),
            # decrease the learning rate by a 0.5 factor on plateau
            AdvancedLearnignRateScheduler(monitor='val_acc', patience=1, verbose=1,
                                          mode='auto', decayRatio=0.5),
            # MultiGPUCheckpointCallback(model_fn, base_model=base_model, monitor='val_acc',
            #                            save_best_only=True, verbose=1, save_weights_only=True),
            # TensorBoard(tensorboard_fn, batch_size=batch_size, histogram_freq=2),
            CSVLogger(csvlogger_fn)
        ]
        model.compile(optimizer=keras.optimizers.Adam(lr=learn_rate),
                      loss='categorical_crossentropy', metrics=['accuracy'])
        hist = model.fit(k_x_train, k_y_train, validation_data=(k_x_val, k_y_val),
                         epochs=epoch, batch_size=batch_size, callbacks=callbacks)
        # last-epoch validation accuracy
        i_val_acc = hist.history['val_acc'][-1]
        print("Accuracy: {0:.6%}".format(i_val_acc))
        val_acc_arr.append(i_val_acc)
        del model
        del k_x_train, k_y_train, k_x_val, k_y_val
        K.clear_session()
        gc.collect()
        i_config = tf.ConfigProto()
        i_config.gpu_options.allow_growth = True
        # i_config = tf.ConfigProto(device_count={'GPU': n_gpu, 'CPU': n_cpu})
        i_sess = tf.Session(config=i_config)
        K.set_session(i_sess)
    cv_mean_val_acc = np.mean(val_acc_arr)
    global best_accuracy
    if cv_mean_val_acc > best_accuracy:
        best_accuracy = cv_mean_val_acc
    # negated because the Bayesian optimizer minimizes the objective
    return -cv_mean_val_acc
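# gp_fitness returns the negated mean validation accuracy, i.e. it is written to be
# minimized. One plausible wiring (an assumption; the driver code is not shown in
# this section, and the bounds below are illustrative) uses scikit-optimize:
#
#   from skopt import gp_minimize
#   from skopt.space import Real, Integer, Categorical
#   from skopt.utils import use_named_args
#
#   space = [Integer(128, 2048, name='batch_size'),
#            Integer(10, 100, name='epoch'),
#            Real(1e-4, 1e-1, prior='log-uniform', name='learn_rate'),
#            Real(0.1, 0.9, name='drop'),
#            Integer(32, 256, name='fc_size'),
#            Categorical(['4_4_3', '4_3_2'], name='blocks')]
#
#   res = gp_minimize(use_named_args(space)(gp_fitness), space, n_calls=30)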
def gp_fitness(batch_size, epoch, learn_rate, learn_rate_decay, filters, drop, fc_size, blocks):
    print('batch_size: {}'.format(batch_size))
    print('epoch: {}'.format(epoch))
    print('learn rate: {0:.3f}'.format(learn_rate))
    print('learn rate decay: {0:.1f}'.format(learn_rate_decay))
    print('filters: {}'.format(filters))
    print('drop ratio: {0:.1f}'.format(drop))
    print('fc size: {}'.format(fc_size))
    print('blocks: {}'.format(blocks))
    tmp_out_name = out_name(batch_size, epoch, learn_rate, learn_rate_decay,
                            filters, drop, fc_size, blocks)
    val_acc_arr = []
    # for i, (train_idx, val_idx) in enumerate(skf.split(x_train, y_train_labels)):
    #     ix_train1, ix_val1 = x_train[train_idx], x_train[val_idx]
    #     iy_train1, iy_val1 = y_train[train_idx], y_train[val_idx]
    for i in range(1):
        ix_train1, ix_val1, iy_train1, iy_val1 = train_test_split(
            x_train, y_train, test_size=0.2, shuffle=False)
        nb_trains = ix_train1.shape[0] // batch_size
        nb_examples = batch_size * nb_trains
        k_x_train = ix_train1[:nb_examples]
        k_y_train = iy_train1[:nb_examples]
        k_x_val = np.concatenate((ix_val1, ix_train1[nb_examples:]), axis=0)
        k_y_val = np.concatenate((iy_val1, iy_train1[nb_examples:]), axis=0)
        del ix_train1, ix_val1, iy_train1, iy_val1
        # gc.collect()
        model_fn = os.path.join(_model_dir, '{0}-k{1}.hdf5'.format(tmp_out_name, i))
        tensorboard_fn = os.path.join(_tb_dir, '{0}-tb_k{1}'.format(tmp_out_name, i))
        csvlogger_fn = os.path.join(_csvlogger_dir, '{0}-csvlogger_k{1}'.format(tmp_out_name, i))
        model = cnv_net(n_win_size, n_feat, n_class, filters=filters, kernel_size=16,
                        strides=1, pool_size=2, pool_stride=2, drop=drop,
                        blocks=blocks, fc_size=fc_size, m_name=tmp_out_name)
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other
            # processes, to ensure consistent initialization of all workers when
            # training starts from random weights or a restored checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            # Horovod: average metrics among workers at the end of every epoch.
            # Note: must precede ReduceLROnPlateau, TensorBoard, or other
            # metrics-based callbacks if re-enabled.
            # hvd.callbacks.MetricAverageCallback(),
            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads
            # to worse final accuracy; scale the learning rate up over the first
            # five epochs instead (see https://arxiv.org/abs/1706.02677).
            # hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=verbose),
            # Horovod: after the warmup, reduce the learning rate by 10x at the
            # 30th, 60th, and 80th epochs.
            # hvd.callbacks.LearningRateScheduleCallback(start_epoch=5, end_epoch=30, multiplier=1.),
            # hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
            # hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
            # hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
        ]
        # Horovod: save checkpoints only on the first worker to prevent the other
        # workers from corrupting them.
        if hvd.rank() == 0:
            callbacks.append(
                EarlyStopping(monitor='val_acc', patience=5, verbose=1))
            callbacks.append(
                AdvancedLearnignRateScheduler(monitor='val_acc', patience=1, verbose=1,
                                              mode='auto', decayRatio=learn_rate_decay))
            callbacks.append(
                MultiGPUCheckpointCallback(model_fn, base_model=model, monitor='val_acc',
                                           save_best_only=True, verbose=1,
                                           save_weights_only=True))
            callbacks.append(
                TensorBoard(tensorboard_fn, batch_size=batch_size, histogram_freq=2))
            callbacks.append(CSVLogger(csvlogger_fn))
        # Horovod: the learning rate is typically scaled by hvd.size();
        # it is left unscaled here.
        opt = keras.optimizers.Adam(lr=learn_rate)
        # Horovod: wrap with the distributed optimizer for gradient averaging.
        opt = hvd.DistributedOptimizer(opt)
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        hist = model.fit(k_x_train, k_y_train, validation_data=(k_x_val, k_y_val),
                         verbose=verbose, epochs=epoch, batch_size=batch_size,
                         callbacks=callbacks)
        i_val_acc = hist.history['val_acc'][-1]
        print("Accuracy: {0:.6%}".format(i_val_acc))
        val_acc_arr.append(i_val_acc)
        del model
        del k_x_train, k_y_train, k_x_val, k_y_val
        K.clear_session()
        gc.collect()
        i_config = tf.ConfigProto()
        i_config.gpu_options.allow_growth = True
        # Horovod: pin each process to a single GPU
        i_config.gpu_options.visible_device_list = str(hvd.local_rank())
        i_sess = tf.Session(config=i_config)
        K.set_session(i_sess)
    cv_mean_val_acc = np.mean(val_acc_arr)
    global best_accuracy
    if cv_mean_val_acc > best_accuracy:
        best_accuracy = cv_mean_val_acc
    return -cv_mean_val_acc
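# The Horovod variant above expects one process per GPU; a typical launch
# (a hedged example, the script name is hypothetical) would be:
#
#   horovodrun -np 4 -H localhost:4 python gp_tune_hvd.py
#
# Each process then pins itself to one GPU via
# gpu_options.visible_device_list = str(hvd.local_rank()), as done at the end
# of the loop above.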
def main(args):
    sample_id = args.sample_id
    in_bam_fn = args.in_bam_fn
    min_ratio = args.min_ratio
    win_size = args.win_size
    chr_id = args.chr_id
    ref_fa_f = args.ref_fa_f
    ref_map_f = args.ref_map_f
    step_size = args.step_size
    model_weight_fn = args.model_weight_fn
    out_dir = args.out_dir
    n_features = args.n_features
    n_classes = args.n_classes

    model_weight_name = os.path.splitext(os.path.basename(model_weight_fn))[0]
    model_in_lst = model_weight_name.split('-')
    model_name = model_in_lst[1]
    model_params_lst = re.findall(r"[-+]?\d*\.\d+|\d+", model_in_lst[0])
    logging.info('model name: {0}, model params(batch, epoch, lr, drop, fc, block, win): {1}'.format(
        model_name, model_params_lst))
    assert len(model_params_lst) >= 6
    drop = float(model_params_lst[3])
    fc_size = int(model_params_lst[4])
    # tuple rather than generator so the blocks spec can be reused
    blocks = tuple(int(x) for x in model_params_lst[5])
    model = None
    if model_name == 'cnvnet':
        model = cnv_net(win_size, n_features, n_classes, drop=drop, blocks=blocks, fc_size=fc_size)
    model.load_weights(model_weight_fn)
    logging.info("finished loading model!")

    out_fn = os.path.join(out_dir, model_weight_name + '_' + sample_id + '.online_cnv_call')
    if os.path.exists(out_fn):
        os.remove(out_fn)

    # generate online features
    online_feat_obj = FeatureOnline(ref_fa_f, ref_map_f)
    online_feat_obj.load_bam(in_bam_fn)

    def online_call(tmp_online_feats, tmp_chr_id):
        for i, res in enumerate(tmp_online_feats):
            reg_start, reg_end, reg_len, out_indicator, f_mat = res
            if out_indicator == 3:
                # normalize each feature row by its max (guard against division by zero)
                i_x_max = np.max(f_mat, axis=-1)
                i_x_max[i_x_max == 0] = 1
                f_mat = f_mat * 1.0 / i_x_max.reshape(n_features, 1)
                f_mat = np.transpose(f_mat)
                y_prob = model.predict(np.array([f_mat]), batch_size=1)[0]
                ypred_prob = ','.join(['{:.6f}'.format(x) for x in y_prob])
                ypred = y_prob.argmax(axis=-1)
                with open(out_fn, 'a') as f:
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        tmp_chr_id, reg_start, reg_end, reg_len, out_indicator,
                        ypred_prob, ypred))
            else:
                # region could not be predicted; record it with empty prediction fields
                with open(out_fn, 'a') as f:
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        tmp_chr_id, reg_start, reg_end, reg_len, out_indicator,
                        None, None))
        del tmp_online_feats
        return 'chromosome {} cnv call done!'.format(tmp_chr_id)

    logging.info('beginning cnv calls for chromosome: {}'.format(chr_id))
    if chr_id.upper() == 'A':  # all autosomes
        chr_lst_idx = [str(i) for i in range(1, 23)]  # + ['X']
        for i_chr_id in chr_lst_idx:
            online_feats = online_feat_obj.run(sample_id, i_chr_id, win_size=win_size,
                                               min_r=min_ratio, stride_size=step_size)
            logging.info(online_call(online_feats, i_chr_id))
    else:
        online_feats = online_feat_obj.run(sample_id, chr_id, win_size=win_size,
                                           min_r=min_ratio, stride_size=step_size)
        logging.info(online_call(online_feats, chr_id))
    logging.info('Sample {} cnv call completed, output: {}'.format(sample_id, out_fn))
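# Hedged example invocation (flag names are inferred from the args.* attributes
# read above; the actual argparse definitions and script name are not shown in
# this section, so treat both as assumptions):
#
#   python cnv_call.py --sample_id NA12878 --in_bam_fn NA12878.bam \
#       --chr_id 1 --win_size 1000 --step_size 200 --min_ratio 0.1 \
#       --ref_fa_f hg19.fa --ref_map_f hg19_mappability.bw \
#       --model_weight_fn model_weight/b1024_e64_lr0.001_dr0.5_fc64_blk443_win1000-cnvnet.hdf5 \
#       --out_dir ./cnv_out --n_features 13 --n_classes 3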