def predict_all(data_name, X, use_cache=False, child_cv_index=None):
    """Run the fold models on ``X`` and return their predictions.

    Args:
        data_name: Name of the data split. ``'val'`` and ``'test'`` get
            special handling; the name is also the cache sub-directory.
        X: Input handed to ``_mf.get_meta_features``.
        use_cache: When true and ``child_cv_index`` is None, return the
            cached result if the cache file exists.
        child_cv_index: Forwarded to ``_mf.get_meta_features`` for
            non-``'val'`` data. With ``data_name == 'test'`` and None, the
            function calls itself for child indices 0..4 and aggregates.

    Returns:
        * ``'val'``: float32 array of shape ``(len(X), 101, 101, 1)`` filled
          from the per-fold predictions at each fold's validation indices.
        * ``'test'`` with ``child_cv_index`` None: list of 5 arrays, each the
          mean over folds of the predictions binarized with the thresholds
          produced by the stored ``ath_estimator``.
        * otherwise: list with one flip-averaged prediction per fold.
    """
    is_top_level = child_cv_index is None
    cache_path = CACHE_DIR / data_name / f'{MODEL_NAME}.pkl'
    if is_top_level and use_cache and cache_path.is_file():
        return joblib.load(cache_path)

    if is_top_level and data_name == 'test':
        bin_median = np.median(joblib.load(CACHE_DIR / 'test' / 'bin_nas.pkl'), axis=0)
        reg_median = np.median(joblib.load(CACHE_DIR / 'test' / 'reg_nas.pkl'), axis=0)
        ath_estimator = joblib.load(MODELS_DIR / 'ath_estimator.pkl')
        pred = []
        for child_index in range(5):
            binarized = []
            for fold_pred in predict_all(data_name, X, use_cache, child_index):
                features = _ath.create_input_data(fold_pred, bin_median, reg_median)
                # One threshold per sample, broadcast over the remaining axes.
                per_sample_th = np.reshape(ath_estimator.predict(features), (len(fold_pred), 1, 1, 1))
                binarized.append(fold_pred > per_sample_th)
            pred.append(np.mean(binarized, axis=0))
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pred, cache_path, compress=3)
        return pred

    is_val = data_name == 'val'
    if is_val:
        features_val = _mf.get_meta_features(data_name, X)
        split_seed = int((MODELS_DIR / 'split_seed.txt').read_text())
        # Keep only the validation indices of every fold.
        vi_list = [
            tk.ml.cv_indices(features_val, None, cv_count=CV_COUNT, cv_index=fold,
                             split_seed=split_seed, stratify=False)[1]
            for fold in range(CV_COUNT)
        ]
        X_list = [features_val[vi] for vi in vi_list]
    else:
        features_test = _mf.get_meta_features(data_name, X, child_cv_index)
        X_list = [features_test] * CV_COUNT

    gen = tk.generator.SimpleGenerator()
    model = tk.dl.models.Model.load(MODELS_DIR / f'model.fold0.h5', gen, batch_size=BATCH_SIZE, multi_gpu=True)

    pred_list = []
    for fold in tk.tqdm(range(CV_COUNT), desc='predict'):
        # Fold 0 weights came with Model.load; later folds only swap weights.
        if fold != 0:
            model.load_weights(MODELS_DIR / f'model.fold{fold}.h5')

        fold_X = X_list[fold]
        plain = model.predict(fold_X, verbose=0)
        # Predict on the input reversed along axis 2, then reverse back.
        mirrored = model.predict(fold_X[:, :, ::-1, :], verbose=0)[:, :, ::-1, :]
        pred_list.append(np.mean([plain, mirrored], axis=0))

    if is_val:
        pred = np.empty((len(X), 101, 101, 1), dtype=np.float32)
        for vi, fold_pred in zip(vi_list, pred_list):
            pred[vi] = fold_pred
    else:
        pred = pred_list

    if data_name != 'test':
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pred, cache_path, compress=3)
    return pred
def _main():
    """Plot the prior boxes marked as assigned for augmented samples.

    Takes the first sample returned by ``tk.data.voc.load_07_test``, feeds it
    through the object-detection generator with data augmentation for at most
    32 batches, and writes the plots to ``___assign_check/{i}.jpg`` next to
    this file.
    """
    here = pathlib.Path(__file__).resolve().parent
    out_dir = here / '___assign_check'
    out_dir.mkdir(exist_ok=True)
    voc_root = here / 'pytoolkit' / 'data' / 'VOCdevkit'

    X_val, y_val = tk.data.voc.load_07_test(voc_root)
    X_val = X_val[:1]
    y_val = y_val[:1]

    od = tk.dl.od.ObjectDetector.load_voc(batch_size=1, use_multi_gpu=False)
    gen = tk.dl.od.od_gen.create_generator(
        (512, 512), preprocess_input=lambda x: x, encode_truth=od.pb.encode_truth)
    batches, _ = gen.flow(X_val, y_val, data_augmentation=True)

    # The progress range comes first in zip so it bounds the loop at 32 batches.
    for step, (X_batch, y_batch) in zip(tk.tqdm(range(32)), batches):
        for image, encoded in zip(X_batch, y_batch):
            # Everything that was assigned.
            assigned = encoded[:, 1] == 1
            class_ids = np.argmax(encoded[assigned, 2:-4], axis=-1)
            # Decoding all-zero offsets yields the coordinates of the prior boxes themselves.
            prior_boxes = od.pb.decode_locs(np.zeros((len(encoded), 4)), xp=np)
            plotted = tk.ml.plot_objects(
                image, class_ids, None, prior_boxes[assigned, :], tk.data.voc.CLASS_NAMES)
            tk.ndimage.save(out_dir / f'{step}.jpg', plotted)
def _load_image(X):
    """Load grayscale images as one float32 array with a trailing channel axis.

    Args:
        X: Iterable of image paths (anything ``str()`` turns into a path).

    Returns:
        ``np.ndarray`` of the stacked float32 images with an extra last axis
        of size 1 added by ``np.expand_dims``.

    Raises:
        OSError: If ``cv2.imread`` cannot read one of the files.
    """
    images = []
    for path in tk.tqdm(X, desc='load'):
        image = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising;
            # fail here with the offending path rather than with an opaque
            # AttributeError on ``.astype``.
            raise OSError(f'failed to read image: {path}')
        images.append(image.astype(np.float32))
    return np.expand_dims(np.array(images), axis=-1)
def create_data(y, pred, pred_bin, pred_reg):
    """Create adaptive threshold data.

    Builds the training pair for the per-sample threshold estimator.

    Args:
        y: Ground-truth array, reduced over axes 1-3 per sample.
        pred: Predicted array, indexed and reduced the same way as ``y``.
        pred_bin: Forwarded to ``create_input_data``.
        pred_reg: Forwarded to ``create_input_data``.

    Returns:
        Tuple ``(threshold_X, threshold_y)``. ``threshold_X`` is the result of
        ``create_input_data``. ``threshold_y`` is a float array of length
        ``len(y)`` with the target threshold for each sample:

        * samples whose ground truth is all zero get the midpoint between
          their maximum prediction and 1, limited to
          ``[MIN_THRESHOLD, MAX_THRESHOLD]``;
        * all other samples get the first threshold on a 1000-point grid over
          ``[MIN_THRESHOLD, MAX_THRESHOLD]`` that maximizes the IoU between
          ``pred > threshold`` and ``y > 0.5``.
    """
    threshold_X = create_input_data(pred, pred_bin, pred_reg)

    mask_neg = y.max(axis=(1, 2, 3)) == 0
    mask_pos = np.logical_not(mask_neg)
    y_pos = y > 0.5

    threshold_y = np.empty((len(y), ))

    th_neg = (pred[mask_neg].max(axis=(1, 2, 3)) + 1) / 2
    threshold_y[mask_neg] = np.minimum(np.maximum(th_neg, MIN_THRESHOLD), MAX_THRESHOLD)

    # The candidate grid does not depend on the sample, so build it once.
    threshold_list = np.linspace(MIN_THRESHOLD, MAX_THRESHOLD, 1000)
    for i in tk.tqdm(np.where(mask_pos)[0], desc='ath'):
        truth = y_pos[i]
        iou_list = []
        for th in threshold_list:
            pred_pos = pred[i] > th
            inter = np.sum(np.logical_and(pred_pos, truth))
            union = np.sum(np.logical_or(pred_pos, truth))
            # max(..., 1) avoids dividing by zero when both masks are empty.
            iou_list.append(inter / max(union, 1))
        # np.argmax returns the first maximum, i.e. the lowest best threshold.
        threshold_y[i] = threshold_list[np.argmax(iou_list)]

    return threshold_X, threshold_y
def predict_all(data_name, X, use_cache=False, child_cv_index=None):
    """Run the fold models on the meta features of ``X``.

    Args:
        data_name: Name of the data split. ``'val'`` and ``'test'`` get
            special handling; the name is also the cache sub-directory.
        X: Input handed to ``get_meta_features``.
        use_cache: When true and ``child_cv_index`` is None, return the
            cached result if the cache file exists.
        child_cv_index: Forwarded to ``get_meta_features`` for non-``'val'``
            data. With ``data_name == 'test'`` and None, the function calls
            itself for child indices 0..4 and returns the list of results.

    Returns:
        * ``'val'``: float32 array of shape ``(len(X), 101, 101, 1)`` filled
          from the per-fold predictions at each fold's validation indices.
        * ``'test'`` with ``child_cv_index`` None: list of 5 fold-averaged
          predictions, one per child index.
        * otherwise: the mean of the per-fold predictions.
    """
    is_top_level = child_cv_index is None
    cache_path = CACHE_DIR / data_name / f'{MODEL_NAME}.pkl'
    if is_top_level and use_cache and cache_path.is_file():
        return joblib.load(cache_path)

    if is_top_level and data_name == 'test':
        pred = []
        for child_index in range(5):
            pred.append(predict_all(data_name, X, use_cache, child_index))
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pred, cache_path, compress=3)
        return pred

    is_val = data_name == 'val'
    if is_val:
        features_val = get_meta_features(data_name, X)
        split_seed = int((MODELS_DIR / 'split_seed.txt').read_text())
        # Keep only the validation indices of every fold.
        vi_list = [
            tk.ml.cv_indices(features_val, None, cv_count=CV_COUNT, cv_index=fold,
                             split_seed=split_seed, stratify=False)[1]
            for fold in range(CV_COUNT)
        ]
        X_list = [features_val[vi] for vi in vi_list]
    else:
        features_test = get_meta_features(data_name, X, child_cv_index)
        X_list = [features_test] * CV_COUNT

    gen = tk.generator.SimpleGenerator()
    model = tk.dl.models.Model.load(MODELS_DIR / f'model.fold0.h5', gen, batch_size=BATCH_SIZE, multi_gpu=True)

    pred_list = []
    for fold in tk.tqdm(range(CV_COUNT), desc='predict'):
        # Fold 0 weights came with Model.load; later folds only swap weights.
        if fold != 0:
            model.load_weights(MODELS_DIR / f'model.fold{fold}.h5')
        pred_list.append(model.predict(X_list[fold], verbose=0))

    if is_val:
        pred = np.empty((len(X), 101, 101, 1), dtype=np.float32)
        for vi, fold_pred in zip(vi_list, pred_list):
            pred[vi] = fold_pred
    else:
        pred = np.mean(pred_list, axis=0)

    if data_name != 'test':
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pred, cache_path, compress=3)
    return pred
def get_score_fixed_threshold(y_val, pred_val, search_th=True):
    """Compute a rough score using one threshold for all samples.

    Args:
        y_val: Ground truth; binarized with ``> 0.5``.
        pred_val: Predictions; binarized with ``> threshold``.
        search_th: When true, try 100 thresholds evenly spaced over
            ``[0.3, 0.7]`` and keep the first best one; otherwise use 0.5.

    Returns:
        Tuple ``(score, threshold)`` as computed by ``compute_score``.
    """
    if not search_th:
        threshold = 0.5
        return compute_score(y_val > 0.5, pred_val > threshold), threshold

    candidates = np.linspace(0.3, 0.7, 100)
    scores = [
        compute_score(y_val > 0.5, pred_val > th)
        for th in tk.tqdm(candidates, desc='threshold')
    ]
    best = np.argmax(scores)
    return scores[best], candidates[best]
def predict_all(data_name, X, use_cache=False):
    """Run the fold models with ``_evaluation.predict_tta(mode='bin')``.

    Args:
        data_name: Name of the data split. ``'val'`` predicts each fold's
            validation indices; any other value discards ``X`` and predicts
            on ``_data.load_test_data()``. Also the cache sub-directory.
        X: Validation input (only used when ``data_name == 'val'``).
        use_cache: When true, return the cached result if the cache file
            exists.

    Returns:
        For ``'val'``, a float32 array of shape ``(len(X), 1)`` filled from
        the per-fold predictions; otherwise the list of per-fold predictions.
        The result is always written to the cache file.
    """
    cache_path = CACHE_DIR / data_name / f'{MODEL_NAME}.pkl'
    if use_cache and cache_path.is_file():
        return joblib.load(cache_path)

    is_val = data_name == 'val'
    if is_val:
        split_seed = int((MODELS_DIR / 'split_seed.txt').read_text())
        # Keep only the validation indices of every fold.
        vi_list = [
            tk.ml.cv_indices(X, None, cv_count=CV_COUNT, cv_index=fold,
                             split_seed=split_seed, stratify=False)[1]
            for fold in range(CV_COUNT)
        ]
        X_list = [X[vi] for vi in vi_list]
    else:
        X = _data.load_test_data()
        X_list = [X] * CV_COUNT

    gen = tk.generator.SimpleGenerator()
    model = tk.dl.models.Model.load(MODELS_DIR / f'model.fold0.h5', gen, batch_size=BATCH_SIZE, multi_gpu=True)

    pred_list = []
    for fold in tk.tqdm(range(CV_COUNT), desc='predict'):
        # Fold 0 weights came with Model.load; later folds only swap weights.
        if fold != 0:
            model.load_weights(MODELS_DIR / f'model.fold{fold}.h5')
        pred_list.append(_evaluation.predict_tta(model, X_list[fold], mode='bin'))

    if is_val:
        pred = np.empty((len(X), 1), dtype=np.float32)
        for vi, fold_pred in zip(vi_list, pred_list):
            pred[vi] = fold_pred
    else:
        pred = pred_list

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pred, cache_path, compress=3)
    return pred
def _main():
    """Plot the ground-truth boxes of augmented samples from the generator.

    Takes the first sample returned by ``tk.data.voc.load_07_test``, feeds it
    through the object-detection generator (no truth encoding) with data
    augmentation for at most 32 batches, and writes the plots to
    ``___generator_check/{i}.jpg`` next to this file.
    """
    here = pathlib.Path(__file__).resolve().parent
    out_dir = here / '___generator_check'
    out_dir.mkdir(exist_ok=True)
    voc_root = here / 'pytoolkit' / 'data' / 'VOCdevkit'

    X_val, y_val = tk.data.voc.load_07_test(voc_root)
    X_val = X_val[:1]
    y_val = y_val[:1]

    gen = tk.dl.od.od_gen.create_generator(
        (512, 512), preprocess_input=lambda x: x, encode_truth=None)
    batches, _ = gen.flow(X_val, y_val, data_augmentation=True)

    # The progress range comes first in zip so it bounds the loop at 32 batches.
    for step, (X_batch, y_batch) in zip(tk.tqdm(range(32)), batches):
        for image, truth in zip(X_batch, y_batch):
            plotted = tk.ml.plot_objects(
                image, truth.classes, None, truth.bboxes, tk.data.voc.CLASS_NAMES)
            tk.ndimage.save(out_dir / f'{step}.jpg', plotted)
def log_evaluation(y_val, pred_val, print_fn=None, search_th=False, threshold=None):
    """Log validation results.

    Args:
        y_val: Ground truth; binarized with ``> 0.5``.
        pred_val: Predictions; binarized with ``> threshold``.
        print_fn: Callable used for output; defaults to the ``info`` method
            of this module's logger.
        search_th: When true, search 100 thresholds evenly spaced over
            ``[0.3, 0.7]`` for the best score. ``threshold`` must then be None.
        threshold: Optional array of shape ``(len(pred_val),)`` holding one
            threshold per sample. Ignored when None (0.5 is used unless
            ``search_th`` is set).

    Returns:
        The threshold that was used: the best searched value, 0.5, or the
        given thresholds reshaped to ``(len(pred_val), 1, 1, 1)``.
    """
    if not print_fn:
        print_fn = tk.log.get(__name__).info

    # Accuracy and friends.
    tk.ml.print_classification_metrics(np.ravel(y_val), np.ravel(pred_val), print_fn=print_fn)

    # Threshold search & score display.
    if search_th:
        assert threshold is None
        candidates = np.linspace(0.3, 0.7, 100)
        scores = [
            compute_score(np.int32(y_val > 0.5), np.int32(pred_val > th))
            for th in tk.tqdm(candidates, desc='threshold')
        ]
        best = np.argmax(scores)
        print_fn('scores:')
        # Only every 10th candidate is listed to keep the log short.
        for th, score in zip(candidates[::10], scores[::10]):
            print_fn(f' threshold={th:.3f}: score={score:.3f}')
        threshold = candidates[best]
        score = scores[best]
        print_fn(f'max score: {score:.3f} (threshold: {threshold:.3f})')
    else:
        if threshold is not None:
            assert threshold.shape == (len(pred_val), )
            # One threshold per sample, broadcast over the remaining axes.
            threshold = threshold.reshape((len(pred_val), 1, 1, 1))
            print_fn(f'mean threshold: {np.mean(threshold):.3f}')
        else:
            threshold = 0.5
        score = compute_score(np.int32(y_val > 0.5), np.int32(pred_val > threshold))
        print_fn(f'score: {score:.3f}')

    # Home-grown metrics.
    print_metrics(y_val > 0.5, pred_val > threshold, print_fn=print_fn)

    return threshold
def save_submission(save_path, pred):
    """Write the submission file and log a small analysis of the result.

    Args:
        save_path: Destination of the CSV file.
        pred: Predicted masks, indexed in the order of the ``id`` column of
            ``TEST_PATH`` and reducible over axes 1-3.

    The CSV has an ``id`` index and a single ``rle_mask`` column holding
    ``_encode_rl`` of each prediction. Afterwards the share of samples whose
    maximum prediction casts to 0 is logged as the empty rate.
    """
    # Write the submission file.
    ids = pd.read_csv(TEST_PATH)['id'].values
    encoded = {}
    for index, sample_id in enumerate(tk.tqdm(ids, desc='encode_rl')):
        encoded[sample_id] = _encode_rl(pred[index])
    submission = pd.DataFrame.from_dict(encoded, orient='index')
    submission.index.names = ['id']
    submission.columns = ['rle_mask']
    submission.to_csv(str(save_path))

    # Analyze the result.
    has_mask = np.max(pred, axis=(1, 2, 3)).astype(np.uint8)  # 0 or 1
    has_mask = np.expand_dims(has_mask, axis=-1)
    total = len(has_mask)
    empty_count = total - has_mask.sum()
    logger = tk.log.get(__name__)
    logger.info(
        f'empty rate: {empty_count}/{total} = {100 * empty_count / total:.1f}%'
    )