Example #1
    def __init__(self, weight_path, win_size, n_feat, n_class, drop, _blocks,
                 fc_size):
        import tensorflow as tf
        from keras import backend as K
        from model import cnv_net

        K.clear_session()
        config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)

        model_name = 'cnvnet'
        self.model = cnv_net(win_size,
                             n_feat,
                             n_class,
                             filters=32,
                             kernel_size=16,
                             strides=1,
                             pool_size=2,
                             pool_stride=2,
                             drop=drop,
                             blocks=_blocks,
                             fc_size=fc_size,
                             kernel_regular_l2=None,
                             m_name=model_name)
        self.model.load_weights(weight_path)
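
Only __init__ is shown above, so the wrapper's class name in this usage sketch is a hypothetical placeholder:

    # CnvPredictor stands in for the class whose __init__ appears above
    predictor = CnvPredictor('weights.hdf5', win_size=1000, n_feat=13,
                             n_class=3, drop=0.5, _blocks=(4, 4, 3), fc_size=64)
    probs = predictor.model.predict(x_batch)  # (n_samples, n_class) class probabilities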
Example #2
    def __init__(self, weight_path, win_size, n_feat, n_class, drop, _blocks,
                 fc_size, l2ratio, temp, filters, kernel_size):
        import tensorflow as tf
        from keras import backend as K
        from model import cnv_net

        K.clear_session()
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.2  # (2core)0.3->0.15(4core)
        sess = tf.Session(config=config)
        K.set_session(sess)

        model_name = 'cnvnet'
        self.model = cnv_net(win_size,
                             n_feat,
                             n_class,
                             filters=filters,
                             kernel_size=kernel_size,
                             strides=1,
                             pool_size=2,
                             pool_stride=2,
                             drop=drop,
                             blocks=_blocks,
                             fc_size=fc_size,
                             kernel_regular_l2=l2ratio,
                             temperature=temp,
                             m_name=model_name)
        self.model.load_weights(weight_path)
Example #3
    def __init__(self,
                 weight_path,
                 win_size,
                 n_feat,
                 n_class,
                 drop,
                 _blocks,
                 fc_size,
                 n_proc=10):
        import tensorflow as tf
        from keras import backend as K
        from model import cnv_net

        K.clear_session()
        config = tf.ConfigProto(device_count={"CPU": n_proc},
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        sess = tf.Session(config=config)
        K.set_session(sess)

        model_name = 'cnvnet'
        self.model = cnv_net(win_size,
                             n_feat,
                             n_class,
                             filters=32,
                             kernel_size=16,
                             strides=1,
                             pool_size=2,
                             pool_stride=2,
                             drop=drop,
                             blocks=_blocks,
                             fc_size=fc_size,
                             kernel_regular_l2=None,
                             m_name=model_name)
        self.model.load_weights(weight_path)
Example #4
    def __load_model(self, model_weight_fn):

        logger.info('loading model weight...')
        assert os.path.exists(model_weight_fn)
        model_weight_name = os.path.splitext(os.path.basename(model_weight_fn))[0]
        model_in_lst = model_weight_name.split('-')
        model_name = model_in_lst[1]
        model_params_lst = re.findall(r"[-+]?\d*\.\d+|\d+", model_in_lst[0])

        logger.info('model name: {0}, model params (batch, epoch, lr, drop, fc, block, win): {1}'.format(
            model_name, model_params_lst))
        # seven numeric fields expected: model_params_lst[6] is indexed below
        assert len(model_params_lst) >= 7

        batch_size = int(model_params_lst[0])
        drop = float(model_params_lst[3])
        fc_size = int(model_params_lst[4])
        # block spec is encoded as concatenated digits, e.g. '443' -> (4, 4, 3)
        blocks = tuple(int(x) for x in model_params_lst[5])
        win_size = int(model_params_lst[6])

        model = None
        if model_name == 'cnvnet':
            model = cnv_net(win_size, self.n_features, self.n_classes,
                            drop=drop, blocks=blocks, fc_size=fc_size)

        model.load_weights(model_weight_fn)
        return model, batch_size
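
For reference, a sketch of what the parameter regex recovers from a weight file following the out_name() convention used in the later examples (this filename is hypothetical):

    import re

    name = 'b1024_e64_lr0.001_dr0.5_fc64_blk443_win1000-cnvnet'
    params = re.findall(r"[-+]?\d*\.\d+|\d+", name.split('-')[0])
    # -> ['1024', '64', '0.001', '0.5', '64', '443', '1000']
    # indices: 0 batch, 1 epoch, 2 lr, 3 drop, 4 fc, 5 blocks, 6 win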
Example #5
def online_call(online_seg_data_root_dir,
                online_call_out_root_dir,
                model_in_root_dir,
                n_win_size=1000,
                n_feat=13,
                n_class=3,
                epochs=64,
                batch=1024,
                learn_rate=0.001,
                drop=0.5,
                fc_size=64,
                blocks='4_4_3',
                step_size=200,
                sample_id='NA12878',
                chr_id='1',
                min_ratio=0.1,
                seg_range='a',
                cost_mat='221',
                n_proc=10):

    # get model name
    _blocks = tuple(int(x) for x in blocks.split('_'))  # e.g. '4_4_3' -> (4, 4, 3)

    def out_name():
        str_blocks = [str(x) for x in blocks.split('_')]
        str_blk = ''.join(str_blocks)
        return 'b{0}_e{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_blk{5}_win{6}_cw{7}'.format(
            batch, epochs, learn_rate, drop, fc_size, str_blk, n_win_size,
            cost_mat)

    # set environment
    time.sleep(10)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    logger.info('waiting available gpu device...')
    while True:
        gpu_id_lst = GPU.getFirstAvailable(order='random',
                                           maxMemory=0.001,
                                           maxLoad=0.001,
                                           attempts=50,
                                           interval=60)
        if len(gpu_id_lst) > 0:
            break
    logger.info('processing on device id {}...'.format(gpu_id_lst[0]))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id_lst[0])

    logger.info('loading model...')
    K.clear_session()
    config = tf.ConfigProto(device_count={"CPU": n_proc},
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    model_name = 'cnvnet'
    model = cnv_net(n_win_size,
                    n_feat,
                    n_class,
                    filters=32,
                    kernel_size=16,
                    strides=1,
                    pool_size=2,
                    pool_stride=2,
                    drop=drop,
                    blocks=_blocks,
                    fc_size=fc_size,
                    kernel_regular_l2=None,
                    m_name=model_name)

    model_weight_fn = os.path.join(model_in_root_dir,
                                   out_name() + '-' + model_name + '.hdf5')
    if not os.path.exists(model_weight_fn):
        raise FileNotFoundError(
            'model weight file not found. {}'.format(model_weight_fn))
    model.load_weights(model_weight_fn)

    # load data
    # default: online_seg_data_root_dir = '/zfssz2/ST_MCHRI/BIGDATA/PROJECT/NIPT_CNV/f_cnv_out/online'
    online_seg_data_subroot_dir = os.path.join(online_seg_data_root_dir,
                                               sample_id + '/data')
    if not os.path.isdir(online_seg_data_subroot_dir):
        raise FileNotFoundError(
            'No segments generated for sample {}, chr {}'.format(
                sample_id, chr_id))

    part_fname = 'win{0}_step{1}_r{2:.2f}_chr{3}_seg_'.format(
        n_win_size, step_size, min_ratio, chr_id)
    # create out dir
    # default: online_call_out_root_dir = '/zfssz2/ST_MCHRI/BIGDATA/PROJECT/NIPT_CNV/f_cnv_out/online'
    online_call_out_subroot_dir = os.path.join(online_call_out_root_dir,
                                               sample_id + '/cnv_call')
    if not os.path.isdir(online_call_out_subroot_dir):
        os.makedirs(online_call_out_subroot_dir)  # create intermediate dirs as well
    call_out_fn = os.path.join(
        online_call_out_subroot_dir,
        'win{0}_step{1}_r{2:.2f}_chr{3}-cnv-call-'.format(
            n_win_size, step_size, min_ratio, chr_id))
    if seg_range != 'a':  # if not whole chr
        part_fname = part_fname + 'p-' + seg_range + '_'
        call_out_fn = call_out_fn + 'p-' + seg_range + '-'
    else:
        part_fname = part_fname + 'a_'
        call_out_fn = call_out_fn + 'a-'

    call_out_fn = call_out_fn + 'result.csv'

    if os.path.exists(call_out_fn):
        os.remove(call_out_fn)

    gap_h5_fn = os.path.join(online_seg_data_subroot_dir,
                             part_fname + 'gap.h5')
    gap_pred = None
    if os.path.exists(gap_h5_fn):
        with h5py.File(gap_h5_fn, 'r') as gap_fn_read:
            gap_obj = gap_fn_read.get('gap')
            if gap_obj is not None:
                gap_result = gap_obj.value
                tmp_arr = np.full((gap_result.shape[0], 4), -1)
                # tmp_arr[:] = np.nan
                gap_pred = np.concatenate((gap_result, tmp_arr), axis=1)
                del tmp_arr

    unpred_h5_fn_list = glob.glob(
        os.path.join(online_seg_data_subroot_dir, part_fname + 'unpred_*'))
    f_unpred_arr = None
    for i_unpred_fn in unpred_h5_fn_list:
        with h5py.File(i_unpred_fn, 'r') as i_unpred_fn_read:
            i_unpred_meta = i_unpred_fn_read.get('unpred_meta').value
        i_unpred_meta_arr = np.array(i_unpred_meta)

        tmp_arr = np.full((i_unpred_meta_arr.shape[0], 4), -1)
        # tmp_arr[:] = np.nan

        if f_unpred_arr is None:
            f_unpred_arr = np.concatenate((i_unpred_meta_arr, tmp_arr), axis=1)
        else:
            unpred_arr = np.concatenate((i_unpred_meta_arr, tmp_arr), axis=1)
            f_unpred_arr = np.concatenate((f_unpred_arr, unpred_arr), axis=0)

    pred_h5_fn_list = glob.glob(
        os.path.join(online_seg_data_subroot_dir, part_fname + 'pred_*'))

    logger.info('calling cnv...')
    f_pred_res = None

    for i, i_fn in enumerate(pred_h5_fn_list):
        logger.info('processing {}/{}:{}'.format(i + 1, len(pred_h5_fn_list),
                                                 i_fn))
        with h5py.File(i_fn, 'r') as i_fn_read:
            i_fn_pred_meta = i_fn_read.get('pred_meta').value
            i_fn_pred_feat = i_fn_read.get('pred_feat').value

        i_feat_arr = np.array(i_fn_pred_feat)
        del i_fn_pred_feat

        ypred = model.predict_on_batch(i_feat_arr)
        ypred_l = np.argmax(ypred, axis=1)

        assert len(i_fn_pred_meta) == ypred.shape[0]

        if f_pred_res is None:
            f_pred_res = np.concatenate(
                (np.array(i_fn_pred_meta), ypred, ypred_l[:, np.newaxis]),
                axis=1)
        else:
            i_pred_res = np.concatenate(
                (np.array(i_fn_pred_meta), ypred, ypred_l[:, np.newaxis]),
                axis=1)
            f_pred_res = np.concatenate((f_pred_res, i_pred_res), axis=0)

    logger.info('combining and sorting results...')
    # combine whichever of the three result sets exist
    parts = [a for a in (gap_pred, f_unpred_arr, f_pred_res) if a is not None]
    if not parts:
        raise ValueError('no gap/unpredicted/predicted results to combine')
    whl_cnv_re = np.concatenate(parts, axis=0) if len(parts) > 1 else parts[0].copy()

    del gap_pred, f_unpred_arr, f_pred_res

    ind = np.argsort(whl_cnv_re[:, 0])
    whl_cnv_re = whl_cnv_re[ind]

    out_df = pd.DataFrame(data=whl_cnv_re,
                          columns=[
                              'seg_s', 'seg_e', 'seg_l', 'indicator', 'p_neu',
                              'p_del', 'p_dup', 'pred_l'
                          ])
    out_df[['seg_s', 'seg_e', 'seg_l', 'indicator', 'pred_l'
            ]] = out_df[['seg_s', 'seg_e', 'seg_l', 'indicator',
                         'pred_l']].astype(int)
    out_df.to_csv(call_out_fn, index=False, sep='\t')
    logger.info(
        'Done, online cnv call results saved into {}'.format(call_out_fn))
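
A sketch of reading the call results back (column names come from the code above; the 0/1/2 label encoding for neutral/DEL/DUP is inferred from the p_neu/p_del/p_dup column order):

    import pandas as pd

    df = pd.read_csv(call_out_fn, sep='\t')
    cnv_calls = df[df['pred_l'].isin([1, 2])]  # keep windows predicted DEL (1) or DUP (2)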
Example #6
def train(training_generator, validation_data, model_root_dir, epochs=50, batch=256,
          learn_rate=0.001, drop=0.5, fc_size=128, blocks='4_4_3', n_gpu=4, n_cpu=10):
    """
    epoch from 50 -> 60
    :param x_train:
    :param y_train:
    :param model_root_dir:
    :param epochs:
    :param batch:
    :param learn_rate:
    :param drop:
    :param fc_size:
    :param blocks:
    :param n_gpu:
    :return:
    """
    # n_example, n_win_size, n_feat = x_train.shape
    n_win_size = 1000
    n_class = 3
    n_feat = 13
    # k_x_train1, k_x_val1, k_y_train1, k_y_val1 = train_test_split(x_train, y_train, test_size=0.2, random_state=123)

    # nb_trains = k_x_train1.shape[0] // batch
    # nb_examples = batch * nb_trains
    # k_x_train = k_x_train1[:nb_examples]
    # k_y_train = k_y_train1[:nb_examples]
    # k_x_val = np.concatenate((k_x_val1, k_x_train1[nb_examples:]), axis=0)
    # k_y_val = np.concatenate((k_y_val1, k_y_train1[nb_examples:]), axis=0)
    # del k_x_train1, k_x_val1, k_y_train1, k_y_val1
    gc.collect()

    def out_name():
        str_blocks = [str(x) for x in blocks.split('_')]
        str_blk = ''.join(str_blocks)
        return 'b{0}_e{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_blk{5}_win{6}'.format(batch,
                                                                          epochs,
                                                                          learn_rate,
                                                                          drop,
                                                                          fc_size,
                                                                          str_blk,
                                                                          n_win_size)

    _blocks = tuple(int(x) for x in blocks.split('_'))  # e.g. '4_4_3' -> (4, 4, 3)

    K.clear_session()
    # config = tf.ConfigProto()
    config = tf.ConfigProto(device_count={'GPU': n_gpu, 'CPU': n_cpu})
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    model_name = 'cnvnet'
    base_model = cnv_net(n_win_size, n_feat, n_class, filters=32, kernel_size=16, strides=1, pool_size=2,
                         pool_stride=2, drop=drop, blocks=_blocks, fc_size=fc_size,
                         kernel_regular_l2=None, m_name=model_name)
    # base_model = cnv_net_seq(n_win_size, n_feat, n_class)
    if n_gpu > 1:
        model = multi_gpu_model(base_model, n_gpu)
    else:
        model = base_model

    _model_dir = os.path.join(model_root_dir, 'final_model/model_weight')
    if not os.path.isdir(_model_dir):
        os.makedirs(_model_dir)
    _tb_dir = os.path.join(model_root_dir, 'final_model/tb_logs')
    if not os.path.isdir(_tb_dir):
        os.makedirs(_tb_dir)
    _csvlogger_dir = os.path.join(model_root_dir, 'final_model/model_csvlogger')
    if not os.path.isdir(_csvlogger_dir):
        os.makedirs(_csvlogger_dir)

    model_fn = os.path.join(_model_dir, '{}-{}.hdf5'.format(out_name(), model_name))
    tensorboard_fn = os.path.join(_tb_dir, '{}-{}'.format(out_name(), model_name))
    csvlogger_fn = os.path.join(_csvlogger_dir, '{}-{}'.format(out_name(), model_name))
    callbacks = [
        # Early stopping definition
        EarlyStopping(monitor='val_acc', patience=5, verbose=1),
        # Decrease learning rate by 0.5 factor
        AdvancedLearnignRateScheduler(monitor='val_acc', patience=1, verbose=1, mode='auto', decayRatio=0.5),
        # Saving best model
        MultiGPUCheckpointCallback(model_fn, base_model=base_model, monitor='val_acc',
                                   save_best_only=True, verbose=1, save_weights_only=True),
        TensorBoard(tensorboard_fn, batch_size=batch, histogram_freq=2),
        CSVLogger(csvlogger_fn)
    ]

    model.compile(optimizer=keras.optimizers.Adam(lr=learn_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # model.fit(k_x_train, k_y_train, validation_data=(k_x_val, k_y_val),
    #           epochs=epochs, batch_size=batch, callbacks=callbacks)

    model.fit_generator(generator=training_generator,
                        validation_data=validation_data,
                        epochs=epochs,
                        callbacks=callbacks,
                        use_multiprocessing=True,
                        workers=n_cpu)
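
train() consumes a ready-made training_generator; a minimal sketch of a compatible generator using the standard keras.utils.Sequence API (in-memory x/y arrays stand in for whatever backing store the real generator uses):

    import numpy as np
    from keras.utils import Sequence

    class ArraySequence(Sequence):
        """Hypothetical minimal batch generator."""

        def __init__(self, x, y, batch):
            self.x, self.y, self.batch = x, y, batch

        def __len__(self):
            # number of batches per epoch
            return int(np.ceil(len(self.x) / self.batch))

        def __getitem__(self, idx):
            s = slice(idx * self.batch, (idx + 1) * self.batch)
            return self.x[s], self.y[s]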
Example #7
def evaluate(model_root_dir, total_samples_ls_fn, test_ratio, win_size,
             min_size, min_r, min_f_deldup, min_f_neu, is_norm,
             model_weight_fn):

    xtest = []
    ytest = []
    # load test data set
    test_set_fn = os.path.join(
        model_root_dir, 'test{:.2f}_win{}_minsize{}_dataset.npz'.format(
            test_ratio, win_size, min_size))
    if os.path.exists(test_set_fn):
        with np.load(test_set_fn) as test_set:
            xtest = test_set['x_test']
            ytest = test_set['y_test']
    else:
        from keras.utils import to_categorical
        _, test_ids = get_train_test_ids(model_root_dir, total_samples_ls_fn,
                                         test_ratio)
        x_test = []
        y_test_ = []

        for ix, test_id in enumerate(test_ids):
            x, y = get_data(test_id, model_root_dir, win_size, min_r,
                            min_f_deldup, min_f_neu, is_norm)

            if (not (x is None)) and (not (y is None)):
                if len(x_test) == 0:
                    x_test = x
                    y_test_ = y
                else:
                    x_test = np.concatenate((x_test, x), axis=0)
                    y_test_ = np.concatenate((y_test_, y))

        y_ = [1 if x == 'DEL' else 2 if x == 'DUP' else 0 for x in y_test_]
        y_test = to_categorical(y_)
        logger.info('x test: {}, y test: {}'.format(x_test.shape,
                                                    y_test.shape))
        np.savez_compressed(test_set_fn, x_test=x_test, y_test=y_test)
        xtest = x_test
        ytest = y_test

    n_example, n_win_size, n_feat = xtest.shape
    n_class = ytest.shape[1]

    model_weight_name = os.path.splitext(os.path.basename(model_weight_fn))[0]
    model_in_lst = model_weight_name.split('-')
    model_name = model_in_lst[1]
    model_params_lst = re.findall(r"[-+]?\d*\.\d+|\d+", model_in_lst[0])
    logger.info(
        'model name: {0}, model params (batch, epoch, lr, drop, fc, block, win): {1}'
        .format(model_name, model_params_lst))

    assert len(model_params_lst) >= 6

    drop = float(model_params_lst[3])
    fc_size = int(model_params_lst[4])
    blocks = tuple(int(x) for x in model_params_lst[5])  # e.g. '443' -> (4, 4, 3)

    model = None
    if model_name == 'cnvnet':
        model = cnv_net(n_win_size,
                        n_feat,
                        n_class,
                        drop=drop,
                        blocks=blocks,
                        fc_size=fc_size)

    model.load_weights(model_weight_fn)
    logging.info("finished loading model!")

    ypred = model.predict(xtest)
    test_out_dir = os.path.join(model_root_dir, 'test_out')
    if not os.path.isdir(test_out_dir):
        os.makedirs(test_out_dir)  # np.savez_compressed fails if the dir is missing
    test_pred_fn = os.path.join(
        test_out_dir, 'test{:.2f}_win{}_minsize{}_pred.npz'.format(
            test_ratio, win_size, min_size))
    np.savez_compressed(test_pred_fn, ypred=ypred, ytrue=ytest)
    logger.info('Predict Done, result saved at {}'.format(test_pred_fn))
Example #8
def evaluate(test_sample_fn, test_out_root_dir, model_in_root_dir,
             n_win_size=1000, n_feat=13, n_class=3,
             epochs=64, batch=1024, learn_rate=0.001, drop=0.5,
             fc_size=64, blocks='4_4_3', n_cpu=20, class_weights=None):
    # get model name
    _blocks = tuple(int(x) for x in blocks.split('_'))

    def out_name():
        str_blocks = [str(x) for x in blocks.split('_')]
        str_blk = ''.join(str_blocks)
        if class_weights is not None:
            class_weight_label = ''.join(np.array([class_weights[1], class_weights[2], class_weights[0]]).astype(str))
        else:
            class_weight_label = '111'
        return 'b{0}_e{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_blk{5}_win{6}_cw{7}'.format(batch,
                                                                                epochs,
                                                                                learn_rate,
                                                                                drop,
                                                                                fc_size,
                                                                                str_blk,
                                                                                n_win_size,
                                                                                class_weight_label)

    logger.info('loading model... ')

    K.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    model_name = 'cnvnet'
    model = cnv_net(n_win_size, n_feat, n_class, filters=32, kernel_size=16, strides=1, pool_size=2,
                    pool_stride=2, drop=drop, blocks=_blocks, fc_size=fc_size,
                    kernel_regular_l2=None, m_name=model_name)
    model_weight_fn = os.path.join(model_in_root_dir, out_name() + '-' + model_name + '.hdf5')
    if not os.path.exists(model_weight_fn):
        raise FileNotFoundError('model weight file not found. {}'.format(model_weight_fn))
    model.load_weights(model_weight_fn)

    test_out_dir = os.path.join(test_out_root_dir, 'test_out')
    if not os.path.isdir(test_out_dir):
        os.mkdir(test_out_dir)
    test_out_fn = os.path.join(test_out_dir, out_name() + '-offline-test.h5')

    test_sample_df = pd.read_csv(test_sample_fn, sep='\t')

    # slow when using generator
    test_samples_fn_arr = test_sample_df[['f_name']].values
    test_samples_true_arr = test_sample_df['cnv_type_encode'].values

    test_batch_generator = CNVDataGenerator(test_samples_fn_arr, batch,
                                            win_size=n_win_size, n_feat=n_feat, n_classes=n_class,
                                            shuffle=False, pred_gen=True)
    # predict_generator yields whole batches only, i.e. batch_size * n outputs in total
    ypred = model.predict_generator(test_batch_generator,
                                    verbose=1, use_multiprocessing=True,
                                    workers=n_cpu, max_queue_size=64)

    logger.info('writing predicted results into file...')
    with h5py.File(test_out_fn, 'w') as test_out:
        test_out.create_dataset('ypred', data=ypred)
        test_out.create_dataset('ytrue', data=test_samples_true_arr)

    logger.info('Predict Done, result saved at {}'.format(test_out_fn))
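
As the comment above notes, predict_generator emits only whole batches, so ypred may be shorter than the label array when the sample count is not a multiple of the batch size; a sketch of aligning the two when reading the results back:

    with h5py.File(test_out_fn, 'r') as f:
        ypred = f['ypred'][()]
        ytrue = f['ytrue'][()][:ypred.shape[0]]  # truncate labels to the predicted count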
Example #9
def train3(target_train_h5_fn,
           target_val_h5_fn,
           model_out_root_dir,
           n_win_size=5000,
           n_feat=13,
           n_class=3,
           filters=32,
           kernel_size=16,
           epochs=64,
           batch=1024,
           learn_rate=0.001,
           drop=0.5,
           fc_size=64,
           blocks='4_4_3',
           l2r=1e-4,
           temperature=5,
           lbl_smt_frac=0.1,
           pw=0,
           n_gpu=4,
           n_cpu=20):

    _blocks = tuple(int(x) for x in blocks.split('_'))

    def out_name():
        str_blk = ''.join(blocks.split('_'))
        # label-smoothing fraction is written as '0' (not '0.0') when disabled
        lbl_smt = int(lbl_smt_frac) if lbl_smt_frac == 0 else lbl_smt_frac
        return 'b{0}_ep{1}_lr{2:.3f}_dr{3:.1f}_fc{4}_' \
               'blk{5}_win{6}_l2r{7}_temp{8}_lblsmt{9}_pw{10}'.format(
                   batch, epochs, learn_rate, drop, fc_size, str_blk,
                   n_win_size, str(l2r), temperature, lbl_smt, pw)

    K.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    K.set_session(sess)

    model_name = 'cnvnet'
    base_model = cnv_net(n_win_size,
                         n_feat,
                         n_class,
                         filters=filters,
                         kernel_size=kernel_size,
                         strides=1,
                         pool_size=2,
                         pool_stride=2,
                         drop=drop,
                         blocks=_blocks,
                         fc_size=fc_size,
                         kernel_regular_l2=l2r,
                         temperature=temperature,
                         m_name=model_name)

    # base_model = cnv_net_seq(n_win_size, n_feat, n_class)
    if n_gpu > 1:
        model = multi_gpu_model(base_model, n_gpu)
    else:
        model = base_model

    _model_dir = os.path.join(model_out_root_dir, 'model_weight')
    if not os.path.isdir(_model_dir):
        os.mkdir(_model_dir)
    _tb_dir = os.path.join(model_out_root_dir, 'tb_logs')
    if not os.path.isdir(_tb_dir):
        os.mkdir(_tb_dir)
    _csvlogger_dir = os.path.join(model_out_root_dir, 'model_csvlogger')
    if not os.path.isdir(_csvlogger_dir):
        os.mkdir(_csvlogger_dir)

    model_fn = os.path.join(_model_dir,
                            '{}-{}.hdf5'.format(out_name(), model_name))
    if os.path.exists(model_fn):
        os.remove(model_fn)

    tensorboard_fn = os.path.join(_tb_dir,
                                  '{}-{}'.format(out_name(), model_name))
    if os.path.isdir(tensorboard_fn):
        shutil.rmtree(tensorboard_fn, ignore_errors=True)

    csvlogger_fn = os.path.join(_csvlogger_dir,
                                '{}-{}'.format(out_name(), model_name))
    if os.path.exists(csvlogger_fn):
        os.remove(csvlogger_fn)

    with h5py.File(target_train_h5_fn, 'r') as train_h5:
        train_len = train_h5['x'].shape[0]
    with h5py.File(target_val_h5_fn, 'r') as val_h5:
        val_len = val_h5['x'].shape[0]

    callbacks = [
        # Early stopping definition
        EarlyStopping(monitor='val_fmeasure',
                      mode='max',
                      patience=10,
                      verbose=1),
        # Decrease learning rate by 0.5 factor
        AdvancedLearnignRateScheduler(monitor='val_fmeasure',
                                      patience=1,
                                      verbose=1,
                                      mode='max',
                                      decayRatio=0.5),
        # CyclicLR(mode='triangular', base_lr=learn_rate, max_lr=0.1, step_size=6 * (train_len // batch)),
        # Saving best model
        MultiGPUCheckpointCallback(model_fn,
                                   base_model=base_model,
                                   monitor='val_fmeasure',
                                   mode='max',
                                   save_best_only=True,
                                   verbose=1,
                                   save_weights_only=True),
        # histogram_freq must be 0 when validating from a generator, otherwise:
        # "ValueError: If printing histograms, validation_data must be provided, and cannot be a generator"
        # TensorBoard(tensorboard_fn, batch_size=batch, histogram_freq=0),
        CSVLogger(csvlogger_fn)
    ]

    # fine-tune the cost function so that misclassifications are weighted
    p_misclass_weight = np.ones((n_class, n_class))
    # this could be improved
    # penalizing FN (a true CNV predicted as neutral, column 0)
    p_misclass_weight[:, 0] = 2.0
    # penalizing FP (a neutral window predicted as CNV, row 0)
    p_misclass_weight[0, :] = 2.0
    p_misclass_weight[1, 2] = 2.0
    p_misclass_weight[2, 1] = 2.0
    p_misclass_weight[0, 0] = 1.0

    p_misclass_weight2 = np.ones((n_class, n_class))
    # this could be improved
    # penalizing FN
    p_misclass_weight2[:, 0] = 2.0
    # penalizing FP
    p_misclass_weight2[0, :] = 1.5
    p_misclass_weight2[0, 0] = 1.0

    misclass_dict = {1: p_misclass_weight, 2: p_misclass_weight2}

    # using a weighted loss function
    # https://github.com/keras-team/keras/issues/2115
    if pw > 0:
        custom_loss = partial(w_categorical_crossentropy,
                              weights=misclass_dict[pw])
        custom_loss.__name__ = 'w_categorical_crossentropy'
    elif pw == 0:
        custom_loss = 'categorical_crossentropy'
    elif pw == -1:
        custom_loss = focal_loss(alpha=1.0)
    elif pw == -2:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.2,
                                                 beta_t=0.8,
                                                 gamma_f=2.,
                                                 alpha_f=1.)
    elif pw == -3:
        custom_loss = combine_weighted_ce_focal_loss(weights=misclass_dict[1],
                                                     gamma_f=2.,
                                                     alpha_f=1.)
    elif pw == -4:
        custom_loss = combine_tversky_focal_loss(alpha_t=1,
                                                 beta_t=1,
                                                 gamma_f=2.,
                                                 alpha_f=1.)
    elif pw == -5:
        custom_loss = tversky_loss(alpha=0.5, beta=0.5)
    elif pw == -6:
        custom_loss = tversky_loss(alpha=0.8, beta=0.2)
    elif pw == -7:
        custom_loss = tversky_loss(alpha=0.1, beta=0.9)
    elif pw == -8:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8,
                                                 beta_t=0.2,
                                                 gamma_f=2.,
                                                 alpha_f=1.)
        custom_loss.__name__ = 'tversky_focal_loss'
    elif pw == -9:
        custom_loss = combine_tversky_focal_loss(alpha_t=4.0,
                                                 beta_t=2.0,
                                                 gamma_f=2.,
                                                 alpha_f=1.)
    elif pw == -10:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8,
                                                 beta_t=0.2,
                                                 gamma_f=4.,
                                                 alpha_f=1.)
    elif pw == -11:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8,
                                                 beta_t=0.2,
                                                 gamma_f=1.5,
                                                 alpha_f=1.)
    elif pw == -12:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.7,
                                                 beta_t=0.3,
                                                 gamma_f=2,
                                                 alpha_f=1.)
    elif pw == -13:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8,
                                                 beta_t=0.2,
                                                 gamma_f=2.5,
                                                 alpha_f=1.)
    elif pw == -14:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.8,
                                                 beta_t=0.2,
                                                 gamma_f=3,
                                                 alpha_f=1.)
    elif pw == -15:  # call
        custom_loss = combine_tversky_focal_loss(alpha_t=0.9,
                                                 beta_t=0.1,
                                                 gamma_f=2.0,
                                                 alpha_f=1.)
    elif pw == -16:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.6,
                                                 beta_t=0.4,
                                                 gamma_f=2.0,
                                                 alpha_f=1.)
    elif pw == -17:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.7,
                                                 beta_t=0.3,
                                                 gamma_f=1.5,
                                                 alpha_f=1.)
    elif pw == -18:
        custom_loss = combine_tversky_focal_loss(alpha_t=0.6,
                                                 beta_t=0.4,
                                                 gamma_f=1.5,
                                                 alpha_f=1.)
    else:
        # guard against an unhandled pw value leaving custom_loss undefined
        raise ValueError('unsupported pw value: {}'.format(pw))

    model.compile(optimizer=keras.optimizers.Adam(lr=learn_rate),
                  loss=custom_loss,
                  metrics=['accuracy', precision, recall, fmeasure])

    # model.compile(optimizer=keras.optimizers.Adam(lr=learn_rate),
    #               loss='categorical_crossentropy',
    #               metrics=['accuracy'])

    training_batch_generator = CNVDataGenerator3(target_train_h5_fn,
                                                 train_len,
                                                 batch,
                                                 win_size=n_win_size,
                                                 n_feat=n_feat,
                                                 smooth_factor=lbl_smt_frac)
    val_batch_generator = CNVDataGenerator3(target_val_h5_fn,
                                            val_len,
                                            batch,
                                            win_size=n_win_size,
                                            n_feat=n_feat,
                                            smooth_factor=lbl_smt_frac)

    # fit_generator
    model.fit_generator(
        generator=training_batch_generator,
        steps_per_epoch=(train_len // batch),
        epochs=epochs,
        verbose=1,
        # class_weight=None,  # class_weight=class_weights
        callbacks=callbacks,
        validation_data=val_batch_generator,
        validation_steps=(val_len // batch),
        use_multiprocessing=True,  # True
        workers=n_cpu,  # n_cpu
        max_queue_size=(n_cpu + 10))  # 80
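
The pw > 0 branch builds its loss by binding a weight matrix into w_categorical_crossentropy via functools.partial. For context, a sketch of the weight-matrix categorical crossentropy pattern from the Keras issue linked above (the repo's actual implementation may differ):

    from keras import backend as K

    def w_categorical_crossentropy(y_true, y_pred, weights):
        # weights[t, p] scales the loss when true class t is predicted as class p
        nb_cl = len(weights)
        final_mask = K.zeros_like(y_pred[:, 0])
        y_pred_max = K.max(y_pred, axis=1, keepdims=True)
        # one-hot mask of the arg-max predicted class
        y_pred_max_mat = K.cast(K.equal(y_pred, y_pred_max), K.floatx())
        for c_t in range(nb_cl):
            for c_p in range(nb_cl):
                final_mask += (weights[c_t, c_p]
                               * y_pred_max_mat[:, c_p]
                               * y_true[:, c_t])
        return K.categorical_crossentropy(y_true, y_pred) * final_mask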
Example #10
    def gp_fitness(batch_size, epoch, learn_rate, drop, fc_size, blocks):

        batch_size = int(batch_size)
        epoch = int(epoch)
        learn_rate = float(learn_rate)
        drop = float(drop)
        fc_size = int(fc_size)

        print('batch_size: {}'.format(batch_size))
        print('epoch: {}'.format(epoch))
        print('learn rate: {0:.3f}'.format(learn_rate))
        print('drop ratio: {0:.1f}'.format(drop))
        print('fc size: {}'.format(fc_size))
        print('blocks: {}'.format(blocks))

        # tuple, not a generator: a generator would be exhausted after the first loop iteration
        _blocks = tuple(int(x) for x in blocks.split('_'))

        tmp_out_name = out_name(batch_size, epoch, learn_rate, drop, fc_size,
                                blocks)

        val_acc_arr = []

        # for i, (train_idx, val_idx) in enumerate(skf.split(x_train, y_train_labels)):
        # ix_train1, ix_val1 = x_train[train_idx], x_train[val_idx]
        # iy_train1, iy_val1 = y_train[train_idx], y_train[val_idx]
        for i in range(1):
            ix_train1, ix_val1, iy_train1, iy_val1 = train_test_split(
                x_train, y_train, test_size=0.2, random_state=123)
            nb_trains = ix_train1.shape[0] // batch_size
            nb_examples = batch_size * nb_trains
            k_x_train = ix_train1[:nb_examples]
            k_y_train = iy_train1[:nb_examples]
            k_x_val = np.concatenate((ix_val1, ix_train1[nb_examples:]),
                                     axis=0)
            k_y_val = np.concatenate((iy_val1, iy_train1[nb_examples:]),
                                     axis=0)

            del ix_train1, ix_val1, iy_train1, iy_val1
            # gc.collect()

            model_fn = os.path.join(_model_dir,
                                    '{0}-k{1}.hdf5'.format(tmp_out_name, i))
            tensorboard_fn = os.path.join(
                _tb_dir, '{0}-tb_k{1}'.format(tmp_out_name, i))
            csvlogger_fn = os.path.join(
                _csvlogger_dir, '{0}-csvlogger_k{1}'.format(tmp_out_name, i))

            base_model = cnv_net(n_win_size,
                                 n_feat,
                                 n_class,
                                 filters=32,
                                 kernel_size=16,
                                 strides=1,
                                 pool_size=2,
                                 pool_stride=2,
                                 drop=drop,
                                 blocks=_blocks,
                                 fc_size=fc_size,
                                 kernel_regular_l2=None,
                                 m_name=tmp_out_name)
            if n_gpu > 1:
                model = multi_gpu_model(base_model, n_gpu)
            else:
                model = base_model

            callbacks = [
                # Early stopping definition
                EarlyStopping(monitor='val_acc', patience=5, verbose=1),
                # Decrease learning rate
                AdvancedLearnignRateScheduler(monitor='val_acc',
                                              patience=1,
                                              verbose=1,
                                              mode='auto',
                                              decayRatio=0.5),
                # Saving best model
                # MultiGPUCheckpointCallback(model_fn, base_model=base_model, monitor='val_acc',
                #                            save_best_only=True, verbose=1, save_weights_only=True),
                # TensorBoard(tensorboard_fn, batch_size=batch_size, histogram_freq=2),
                CSVLogger(csvlogger_fn)
            ]

            model.compile(optimizer=keras.optimizers.Adam(lr=learn_rate),
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
            hist = model.fit(k_x_train,
                             k_y_train,
                             validation_data=(k_x_val, k_y_val),
                             epochs=epoch,
                             batch_size=batch_size,
                             callbacks=callbacks)

            i_val_acc = hist.history['val_acc'][-1]
            print("Accuracy: {0:.6%}".format(i_val_acc))
            val_acc_arr.append(i_val_acc)

            del model
            del k_x_train, k_y_train, k_x_val, k_y_val

            K.clear_session()
            gc.collect()
            i_config = tf.ConfigProto()
            i_config.gpu_options.allow_growth = True
            # i_config = tf.ConfigProto(device_count={'GPU': n_gpu, 'CPU': n_cpu})
            i_sess = tf.Session(config=i_config)
            K.set_session(i_sess)

        cv_mean_val_acc = np.mean(val_acc_arr)

        global best_accuracy
        if cv_mean_val_acc > best_accuracy:
            best_accuracy = cv_mean_val_acc

        return -cv_mean_val_acc
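
gp_fitness returns the negative mean validation accuracy, which suggests a Bayesian minimizer such as scikit-optimize's gp_minimize; a hypothetical wiring (search-space bounds are illustrative, not from the source):

    from skopt import gp_minimize
    from skopt.space import Categorical, Integer, Real
    from skopt.utils import use_named_args

    space = [Integer(128, 2048, name='batch_size'),
             Integer(16, 128, name='epoch'),
             Real(1e-4, 1e-2, prior='log-uniform', name='learn_rate'),
             Real(0.1, 0.7, name='drop'),
             Integer(32, 256, name='fc_size'),
             Categorical(['4_4_3', '3_3_3'], name='blocks')]

    result = gp_minimize(use_named_args(space)(gp_fitness), space, n_calls=20)
    print('best mean val_acc: {:.4f}'.format(-result.fun))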
Example #11
    def gp_fitness(batch_size, epoch, learn_rate, learn_rate_decay, filters,
                   drop, fc_size, blocks):
        print('batch_size: {}'.format(batch_size))
        print('epoch: {}'.format(epoch))
        print('learn rate: {0:.3f}'.format(learn_rate))
        print('learn rate decay: {0:.1f}'.format(learn_rate_decay))
        print('filters: {}'.format(filters))
        print('drop ratio: {0:.1f}'.format(drop))
        print('fc size: {}'.format(fc_size))
        print('blocks: {}'.format(blocks))

        tmp_out_name = out_name(batch_size, epoch, learn_rate,
                                learn_rate_decay, filters, drop, fc_size,
                                blocks)

        val_acc_arr = []

        # for i, (train_idx, val_idx) in enumerate(skf.split(x_train, y_train_labels)):
        # ix_train1, ix_val1 = x_train[train_idx], x_train[val_idx]
        # iy_train1, iy_val1 = y_train[train_idx], y_train[val_idx]
        for i in range(1):
            ix_train1, ix_val1, iy_train1, iy_val1 = train_test_split(
                x_train, y_train, test_size=0.2, shuffle=False)
            nb_trains = ix_train1.shape[0] // batch_size
            nb_examples = batch_size * nb_trains
            k_x_train = ix_train1[:nb_examples]
            k_y_train = iy_train1[:nb_examples]
            k_x_val = np.concatenate((ix_val1, ix_train1[nb_examples:]),
                                     axis=0)
            k_y_val = np.concatenate((iy_val1, iy_train1[nb_examples:]),
                                     axis=0)

            del ix_train1, ix_val1, iy_train1, iy_val1
            # gc.collect()

            model_fn = os.path.join(_model_dir,
                                    '{0}-k{1}.hdf5'.format(tmp_out_name, i))
            tensorboard_fn = os.path.join(
                _tb_dir, '{0}-tb_k{1}'.format(tmp_out_name, i))
            csvlogger_fn = os.path.join(
                _csvlogger_dir, '{0}-csvlogger_k{1}'.format(tmp_out_name, i))

            model = cnv_net(n_win_size,
                            n_feat,
                            n_class,
                            filters=filters,
                            kernel_size=16,
                            strides=1,
                            pool_size=2,
                            pool_stride=2,
                            drop=drop,
                            blocks=blocks,
                            fc_size=fc_size,
                            m_name=tmp_out_name)

            callbacks = [
                # Horovod: broadcast initial variable states from rank 0 to all other processes.
                # This is necessary to ensure consistent initialization of all workers when
                # training is started with random weights or restored from a checkpoint.
                hvd.callbacks.BroadcastGlobalVariablesCallback(0),

                # # Horovod: average metrics among workers at the end of every epoch.
                # #
                # # Note: This callback must be in the list before the ReduceLROnPlateau,
                # # TensorBoard, or other metrics-based callbacks.
                # hvd.callbacks.MetricAverageCallback(),
                #
                # # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
                # # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
                # # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
                # hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=verbose),
                #
                # # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
                # hvd.callbacks.LearningRateScheduleCallback(start_epoch=5, end_epoch=30, multiplier=1.),
                # hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
                # hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
                # hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
            ]

            # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
            if hvd.rank() == 0:
                callbacks.append(
                    EarlyStopping(monitor='val_acc', patience=5, verbose=1))
                callbacks.append(
                    AdvancedLearnignRateScheduler(monitor='val_acc',
                                                  patience=1,
                                                  verbose=1,
                                                  mode='auto',
                                                  decayRatio=learn_rate_decay))
                callbacks.append(
                    MultiGPUCheckpointCallback(model_fn,
                                               base_model=model,
                                               monitor='val_acc',
                                               save_best_only=True,
                                               verbose=1,
                                               save_weights_only=True))
                callbacks.append(
                    TensorBoard(tensorboard_fn,
                                batch_size=batch_size,
                                histogram_freq=2))
                callbacks.append(CSVLogger(csvlogger_fn))

            # Horovod: adjust learning rate based on number of GPUs.
            opt = keras.optimizers.Adam(lr=learn_rate)
            # Horovod: add Horovod Distributed Optimizer.
            opt = hvd.DistributedOptimizer(opt)

            model.compile(optimizer=opt,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
            hist = model.fit(k_x_train,
                             k_y_train,
                             validation_data=(k_x_val, k_y_val),
                             verbose=verbose,
                             epochs=epoch,
                             batch_size=batch_size,
                             callbacks=callbacks)

            i_val_acc = hist.history['val_acc'][-1]
            print("Accuracy: {0:.6%}".format(i_val_acc))
            val_acc_arr.append(i_val_acc)

            del model
            del k_x_train, k_y_train, k_x_val, k_y_val

            K.clear_session()
            gc.collect()
            i_config = tf.ConfigProto()
            i_config.gpu_options.allow_growth = True
            i_config.gpu_options.visible_device_list = str(hvd.local_rank())
            i_sess = tf.Session(config=i_config)
            K.set_session(i_sess)

        cv_mean_val_acc = np.mean(val_acc_arr)

        global best_accuracy
        if cv_mean_val_acc > best_accuracy:
            best_accuracy = cv_mean_val_acc

        return -cv_mean_val_acc
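
This variant distributes training with Horovod (hvd.DistributedOptimizer, rank-0-only checkpointing, per-rank GPU pinning); such scripts are normally launched through the horovodrun wrapper, e.g. (script name hypothetical):

    horovodrun -np 4 python gp_search_hvd.py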
Example #12
def main(args):
    sample_id = args.sample_id
    in_bam_fn = args.in_bam_fn
    min_ratio = args.min_ratio
    win_size = args.win_size
    chr_id = args.chr_id
    ref_fa_f = args.ref_fa_f
    ref_map_f = args.ref_map_f
    step_size = args.step_size

    model_weight_fn = args.model_weight_fn
    out_dir = args.out_dir

    n_features = args.n_features
    n_classes = args.n_classes

    model_weight_name = os.path.splitext(os.path.basename(model_weight_fn))[0]
    model_in_lst = model_weight_name.split('-')
    model_name = model_in_lst[1]
    model_params_lst = re.findall(r"[-+]?\d*\.\d+|\d+", model_in_lst[0])

    logging.info(
        'model name: {0}, model params(batch, epoch, lr, drop, fc, block, win): {1}'
        .format(model_name, model_params_lst))

    assert len(model_params_lst) >= 6

    drop = float(model_params_lst[3])
    fc_size = int(model_params_lst[4])
    blocks = tuple(int(x) for x in model_params_lst[5])  # e.g. '443' -> (4, 4, 3)

    model = None
    if model_name == 'cnvnet':
        model = cnv_net(win_size,
                        n_features,
                        n_classes,
                        drop=drop,
                        blocks=blocks,
                        fc_size=fc_size)

    model.load_weights(model_weight_fn)
    logging.info("finished loading model!")

    out_fn = os.path.join(
        out_dir, model_weight_name + '_' + sample_id + '.online_cnv_call')
    if os.path.exists(out_fn):
        os.remove(out_fn)

    # generate online features
    online_feat_obj = FeatureOnline(ref_fa_f, ref_map_f)
    online_feat_obj.load_bam(in_bam_fn)

    def online_call(tmp_online_feats, tmp_chr_id):
        for i, res in enumerate(tmp_online_feats):
            reg_start, reg_end, reg_len, out_indicator, f_mat = res
            if out_indicator == 3:
                # normalization
                i_x_max = np.max(f_mat, axis=-1)
                i_x_max[i_x_max == 0] = 1
                f_mat = f_mat * 1.0 / i_x_max.reshape(n_features, 1)
                f_mat = np.transpose(f_mat)
                y_prob = model.predict(np.array([f_mat]),
                                       batch_size=1)[0]  # batch_size=1?
                ypred_prob = ','.join(['{:.6f}'.format(x) for x in y_prob])
                ypred = y_prob.argmax(axis=-1)
                with open(out_fn, 'a') as f:
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        tmp_chr_id, reg_start, reg_end, reg_len, out_indicator,
                        ypred_prob, ypred))

            else:
                # save to the result output
                with open(out_fn, 'a') as f:
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        tmp_chr_id, reg_start, reg_end, reg_len, out_indicator,
                        None, None))
        del tmp_online_feats
        return 'chromosome {} cnv call done!'.format(tmp_chr_id)

    logging.info('beginning CNV calling for chromosome: {}'.format(chr_id))
    if chr_id.upper() == 'A':  # all chromosomes
        chr_lst_idx = [str(i) for i in range(1, 23)]  # + ['X']
        for i_chr_id in chr_lst_idx:
            online_feats = online_feat_obj.run(sample_id,
                                               i_chr_id,
                                               win_size=win_size,
                                               min_r=min_ratio,
                                               stride_size=step_size)
            logging.info(online_call(online_feats, i_chr_id))
    else:
        online_feats = online_feat_obj.run(sample_id,
                                           chr_id,
                                           win_size=win_size,
                                           min_r=min_ratio,
                                           stride_size=step_size)
        logging.info(online_call(online_feats, chr_id))

    logging.info('Sample {} cnv call completed, output: {}'.format(
        sample_id, out_fn))
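
main(args) reads its inputs from an argparse-style namespace; a hypothetical entry point whose argument names mirror the attributes accessed above (defaults are illustrative, taken from the other examples where available):

    import argparse

    if __name__ == '__main__':
        p = argparse.ArgumentParser(description='online CNV calling')
        for name in ('--sample_id', '--in_bam_fn', '--ref_fa_f', '--ref_map_f',
                     '--model_weight_fn', '--out_dir'):
            p.add_argument(name, required=True)
        p.add_argument('--chr_id', default='A')  # 'A' = all chromosomes
        p.add_argument('--min_ratio', type=float, default=0.1)
        p.add_argument('--win_size', type=int, default=1000)
        p.add_argument('--step_size', type=int, default=200)
        p.add_argument('--n_features', type=int, default=13)
        p.add_argument('--n_classes', type=int, default=3)
        main(p.parse_args())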