def __init__(
    self,
    image_filenames: List[str],
    mask_filenames: Optional[List[str]],
    transform: A.Compose,
    image_loader=read_inria_image,
    mask_loader=read_inria_mask,
    need_weight_mask=False,
    image_ids=None,
    make_mask_target_fn: Callable = mask_to_bce_target,
):
    if mask_filenames is not None and len(image_filenames) != len(mask_filenames):
        raise ValueError("Number of images does not correspond to number of targets")

    self.image_ids = [fs.id_from_fname(fname) for fname in image_filenames] if image_ids is None else image_ids
    self.need_weight_mask = need_weight_mask
    self.images = image_filenames
    self.masks = mask_filenames
    self.get_image = image_loader
    self.get_mask = mask_loader
    self.transform = transform
    self.make_mask_target_fn = make_mask_target_fn
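# Construction sketch (not from the original source). The class name is an
# assumption inferred from get_pseudolabeling_dataset below, which builds an
# InriaImageMaskDataset with this signature; the tile paths are hypothetical.
#
# ds = InriaImageMaskDataset(
#     image_filenames=["data/train_tiles/images/austin6_tile_0.png"],
#     mask_filenames=["data/train_tiles/gt/austin6_tile_0.png"],
#     transform=A.Normalize(),
# )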
def __init__(self, image_filenames, target_filenames, image_loader, target_loader, transform=None, keep_in_mem=False):
    if len(image_filenames) != len(target_filenames):
        raise ValueError('Number of images does not correspond to number of targets')

    self.image_ids = [id_from_fname(fname) for fname in image_filenames]

    if keep_in_mem:
        # Cache decoded images/masks in memory; the getters become identity functions
        self.images = [image_loader(fname) for fname in image_filenames]
        self.masks = [target_loader(fname) for fname in target_filenames]
        self.get_image = lambda x: x
        self.get_loader = lambda x: x
    else:
        self.images = image_filenames
        self.masks = target_filenames
        self.get_image = image_loader
        self.get_loader = target_loader

    self.transform = transform
def test_pseudolabeling_aptos2015_round1(predictions, output_csv):
    print('Saving pseudolabels to ', output_csv)

    num_models = len(predictions)
    ids, x, y_true, y_average = prepare_inference_datasets(predictions, use_features=False, use_predictions=True)

    for i in range(num_models):
        print(
            fs.id_from_fname(predictions[i]),
            cohen_kappa_score(y_true, regression_to_class(x[:, i]), weights='quadratic'),
        )

    y_round = to_numpy(regression_to_class(x))
    y_major = majority_voting(y_round, axis=1)
    y_agreement = y_round == np.expand_dims(y_major, -1)
    # y_agreement_all = np.all(y_agreement, axis=1)
    # y_agreement_all = np.sum(y_agreement, axis=1) >= 16
    y_agreement_all = y_major == y_true

    print('Agreement', np.mean(y_agreement_all))
    print('Distribution', np.bincount(y_major[y_agreement_all]))

    y_true[~y_agreement_all] = -100
    print(y_round)

    df = pd.DataFrame.from_dict({'id_code': ids, 'diagnosis': y_true})
    df.to_csv(output_csv, index=None)
def extract_and_save_dct_jpegio(fname, output_dir):
    # dct_y, dct_cr, dct_cb = compute_dct_fast(fname)
    image_id = fs.id_from_fname(fname) + ".npz"
    method = os.path.split(os.path.split(fname)[0])[1]
    dct_fname = os.path.join(output_dir, method, image_id)

    jpegStruct = jpio.read(fname)
    dct_matrix = jpegStruct.coef_arrays
    quant_tables = jpegStruct.quant_tables

    # ci0 = jpegStruct.comp_info[0]
    # ci1 = jpegStruct.comp_info[1]
    # ci2 = jpegStruct.comp_info[2]

    # De-quantize: tile the 8x8 quantization tables over the full 512x512 plane
    # and multiply the stored (quantized) coefficients by them
    qm0 = np.tile(quant_tables[0], (512 // 8, 512 // 8))
    qm1 = np.tile(quant_tables[1], (512 // 8, 512 // 8))

    np.savez_compressed(
        dct_fname,
        dct_y=(dct_matrix[0] * qm0).astype(np.int16),
        dct_cb=(dct_matrix[1] * qm1).astype(np.int16),
        dct_cr=(dct_matrix[2] * qm1).astype(np.int16),
        qm0=quant_tables[0].astype(np.int16),
        qm1=quant_tables[1].astype(np.int16),
    )
    del jpegStruct
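# Read-back sketch (hypothetical output path, not from the original source):
# the coefficient planes are stored already de-quantized as int16, so downstream
# code can use them without re-applying the quantization tables.
#
# data = np.load("dct/JMiPOD/00001.npz")
# dct_y, dct_cb, dct_cr = data["dct_y"], data["dct_cb"], data["dct_cr"]
# qm0, qm1 = data["qm0"], data["qm1"]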
def cut_dataset_in_patches(data_dir, tile_size, tile_step, image_margin):
    train_data = []
    valid_data = []

    # For validation, we remove the first five images of every location
    # (e.g., austin{1-5}.tif, chicago{1-5}.tif) from the training set.
    # That is the validation strategy suggested by the competition host.
    for loc in TRAIN_LOCATIONS:
        for i in range(1, 6):
            valid_data.append(f"{loc}{i}")
        for i in range(6, 37):
            train_data.append(f"{loc}{i}")

    train_imgs = [os.path.join(data_dir, "train", "images", f"{fname}.tif") for fname in train_data]
    valid_imgs = [os.path.join(data_dir, "train", "images", f"{fname}.tif") for fname in valid_data]

    train_masks = [os.path.join(data_dir, "train", "gt", f"{fname}.tif") for fname in train_data]
    valid_masks = [os.path.join(data_dir, "train", "gt", f"{fname}.tif") for fname in valid_data]

    images_dir = os.path.join(data_dir, "train_tiles", "images")
    masks_dir = os.path.join(data_dir, "train_tiles", "gt")

    df = defaultdict(list)

    for train_img in tqdm(train_imgs, total=len(train_imgs), desc="train_imgs"):
        img_tiles = split_image(train_img, images_dir, tile_size, tile_step, image_margin)
        df["image"].extend(img_tiles)
        df["train"].extend([1] * len(img_tiles))
        df["image_id"].extend([fs.id_from_fname(train_img)] * len(img_tiles))

    for train_msk in tqdm(train_masks, total=len(train_masks), desc="train_masks"):
        msk_tiles = split_image(train_msk, masks_dir, tile_size, tile_step, image_margin)
        df["mask"].extend(msk_tiles)
        df["has_buildings"].extend([read_inria_mask(x).any() for x in msk_tiles])

    # Validation tiles are cut without overlap (tile_step == tile_size)
    for valid_img in tqdm(valid_imgs, total=len(valid_imgs), desc="valid_imgs"):
        img_tiles = split_image(valid_img, images_dir, tile_size, tile_size, image_margin)
        df["image"].extend(img_tiles)
        df["train"].extend([0] * len(img_tiles))
        df["image_id"].extend([fs.id_from_fname(valid_img)] * len(img_tiles))

    for valid_msk in tqdm(valid_masks, total=len(valid_masks), desc="valid_masks"):
        msk_tiles = split_image(valid_msk, masks_dir, tile_size, tile_size, image_margin)
        df["mask"].extend(msk_tiles)
        df["has_buildings"].extend([read_inria_mask(x).any() for x in msk_tiles])

    return pd.DataFrame.from_dict(df)
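# Usage sketch (hypothetical arguments, not from the original source): cut the
# full-resolution INRIA scenes into 512px patches with 50% overlap for training
# and persist the resulting tile index.
#
# df = cut_dataset_in_patches("data", tile_size=512, tile_step=256, image_margin=0)
# df.to_csv("data/train_tiles/inria_tiles.csv", index=False)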
def preprocess(image_fname, output_dir, image_size=768):
    image = cv2.imread(image_fname)
    image = crop_black(image, tolerance=5)
    image = longest_max_size(image, max_size=image_size, interpolation=cv2.INTER_CUBIC)

    image_id = fs.id_from_fname(image_fname)
    dst_fname = os.path.join(output_dir, image_id + '.png')
    cv2.imwrite(dst_fname, image)
def sanitize_fname(x):
    x = fs.id_from_fname(x)
    x = (
        x.replace("fp16", "")
        .replace("fold", "f")
        .replace("local_rank_0", "")
        .replace("nr_rgb_tf_efficientnet_b6_ns", "")
        .replace("rgb_tf_efficientnet_b2_ns", "")
        .replace("rgb_tf_efficientnet_b3_ns", "")
        .replace("rgb_tf_efficientnet_b6_ns", "")
        .replace("rgb_tf_efficientnet_b7_ns", "")
    )
    # Strip the "MonDD_HH_MM" timestamp and any remaining underscores
    x = re.sub(r"\w{3}\d{2}_\d{2}_\d{2}", "", x).replace("_", "")
    return x
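# A quick self-check of the sanitizer (the checkpoint name below is hypothetical;
# the expected value was traced by hand through the replacement chain above):
assert sanitize_fname("B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16.pth") == "Bf0"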
def test_evaluate_model(predictions):
    num_models = len(predictions)
    ids, x, y_true, y_average = prepare_inference_datasets(predictions, use_features=False, use_predictions=True)

    for i in range(num_models):
        print(
            fs.id_from_fname(predictions[i]),
            # Score each model on its own column of predictions
            cohen_kappa_score(y_true, regression_to_class(x[:, i]), weights='quadratic'),
        )
def __getitem__(self, i):
    # Read data
    image = fs.read_rgb_image(self.images_fps[i])
    mask = fs.read_image_as_is(self.masks_fps[i])
    assert mask.max() < len(CLASSES)

    # Apply augmentations
    sample = self.transform(image=image, mask=mask)
    image, mask = sample['image'], sample['mask']

    return {
        "image_id": id_from_fname(self.images_fps[i]),
        "features": tensor_from_rgb_image(image),
        "targets": torch.from_numpy(mask).long(),
    }
def split_image(image_fname, output_dir, tile_size, tile_step, image_margin):
    os.makedirs(output_dir, exist_ok=True)

    image = read_image_as_is(image_fname)
    image_id = id_from_fname(image_fname)

    slicer = ImageSlicer(image.shape, tile_size, tile_step, image_margin)
    tiles = slicer.split(image)

    fnames = []
    for i, tile in enumerate(tiles):
        output_fname = os.path.join(output_dir, f"{image_id}_tile_{i}.png")
        cv2.imwrite(output_fname, tile)
        fnames.append(output_fname)
    return fnames
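# Minimal usage sketch (hypothetical paths, not from the original source): an
# INRIA scene is 5000x5000 px, so tile_size=512 with tile_step=256 yields
# overlapping patches.
#
# tiles = split_image("data/train/images/austin1.tif", "data/train_tiles/images",
#                     tile_size=512, tile_step=256, image_margin=0)
# tiles[0]  # -> "data/train_tiles/images/austin1_tile_0.png"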
def cut_test_dataset_in_patches(data_dir, tile_size, tile_step, image_margin):
    test_imgs = fs.find_images_in_dir(os.path.join(data_dir, "test", "images"))
    images_dir = os.path.join(data_dir, "test_tiles", "images")

    df = defaultdict(list)
    for test_img in tqdm(test_imgs, total=len(test_imgs), desc="test_imgs"):
        img_tiles = split_image(test_img, images_dir, tile_size, tile_step, image_margin)
        df["image"].extend(img_tiles)
        df["image_id"].extend([fs.id_from_fname(test_img)] * len(img_tiles))

    return pd.DataFrame.from_dict(df)
def __init__(
    self,
    image_fname: str,
    mask_fname: str,
    image_loader: Callable,
    target_loader: Callable,
    tile_size,
    tile_step,
    image_margin=0,
    transform=None,
    target_shape=None,
    keep_in_mem=False,
):
    self.image_fname = image_fname
    self.mask_fname = mask_fname
    self.image_loader = image_loader
    self.mask_loader = target_loader
    self.image = None
    self.mask = None

    if target_shape is None or keep_in_mem:
        image = image_loader(image_fname)
        mask = target_loader(mask_fname)
        if image.shape[0] != mask.shape[0] or image.shape[1] != mask.shape[1]:
            raise ValueError(
                f"Image shape {image.shape} and mask shape {mask.shape} must have equal width and height"
            )
        target_shape = image.shape

    self.slicer = ImageSlicer(target_shape, tile_size, tile_step, image_margin)

    if keep_in_mem:
        self.images = self.slicer.split(image)
        self.masks = self.slicer.split(mask)
    else:
        self.images = None
        self.masks = None

    self.transform = transform
    self.image_ids = [
        id_from_fname(image_fname) + f" [{crop[0]};{crop[1]};{crop[2]};{crop[3]};]"
        for crop in self.slicer.crops
    ]
def __init__(
    self,
    image_fname: str,
    mask_fname: str,
    image_loader: Callable,
    target_loader: Callable,
    tile_size,
    tile_step,
    image_margin=0,
    transform=None,
    target_shape=None,
    need_weight_mask=False,
    keep_in_mem=False,
    make_mask_target_fn: Callable = mask_to_bce_target,
):
    self.image_fname = image_fname
    self.mask_fname = mask_fname
    self.image_loader = image_loader
    self.mask_loader = target_loader
    self.image = None
    self.mask = None
    self.need_weight_mask = need_weight_mask

    if target_shape is None or keep_in_mem:
        image = image_loader(image_fname)
        mask = target_loader(mask_fname)
        if image.shape[0] != mask.shape[0] or image.shape[1] != mask.shape[1]:
            raise ValueError(
                f"Image shape {image.shape} and mask shape {mask.shape} must have equal width and height"
            )
        target_shape = image.shape

    self.slicer = ImageSlicer(target_shape, tile_size, tile_step, image_margin)
    self.transform = transform
    self.image_ids = [fs.id_from_fname(image_fname)] * len(self.slicer.crops)
    self.crop_coords_str = [
        f"[{crop[0]};{crop[1]};{crop[2]};{crop[3]};]"
        for crop in self.slicer.crops
    ]
    self.make_mask_target_fn = make_mask_target_fn
def get_pseudolabeling_dataset(data_dir: str, include_masks: bool, image_size=(224, 224), augmentation=None, need_weight_mask=False):
    images = fs.find_images_in_dir(os.path.join(data_dir, "test_tiles", "images"))

    masks_dir = os.path.join(data_dir, "test_tiles", "masks")
    os.makedirs(masks_dir, exist_ok=True)

    masks = [
        os.path.join(masks_dir, fs.id_from_fname(image_fname) + ".png")
        for image_fname in images
    ]

    if augmentation == "hard":
        transform = A.Compose([crop_transform(image_size, input_size=768), hard_augmentations()])
    elif augmentation == "medium":
        transform = A.Compose([crop_transform(image_size, input_size=768), medium_augmentations()])
    elif augmentation == "light":
        transform = A.Compose([crop_transform(image_size, input_size=768), light_augmentations()])
    else:
        transform = A.Normalize()

    return InriaImageMaskDataset(
        images,
        masks if include_masks else None,
        transform=transform,
        image_loader=read_inria_image,
        mask_loader=read_inria_mask_with_pseudolabel,
        need_weight_mask=need_weight_mask,
    )
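# Usage sketch (hypothetical data_dir, not from the original source; assumes the
# tiles were produced by cut_test_dataset_in_patches above). With
# include_masks=False the dataset yields images without targets, which is the
# first step of generating pseudolabels for them:
#
# unlabeled_ds = get_pseudolabeling_dataset("data", include_masks=False,
#                                           image_size=(512, 512), augmentation="light")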
def __getitem__(self, item):
    # Read with OpenCV instead of PIL. It's faster.
    image = cv2.imread(self.images[item])
    if image is None:
        raise FileNotFoundError(self.images[item])
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    height, width = image.shape[:2]

    diagnosis = UNLABELED_CLASS
    if self.targets is not None:
        diagnosis = self.targets[item]

    data = self.transform(image=image, diagnosis=diagnosis)
    diagnosis = data['diagnosis']

    data = {
        'image': tensor_from_rgb_image(data['image']),
        'image_id': id_from_fname(self.images[item]),
    }

    if self.meta_features:
        log_height = math.log(height)
        log_width = math.log(width)
        aspect_ratio = log_height / log_width
        mean = np.mean(image, axis=(0, 1))
        meta_features = np.array([
            log_height,
            log_width,
            aspect_ratio,
            mean[0],
            mean[1],
            mean[2],
        ])
        data['meta_features'] = meta_features

    diagnosis = self.dtype(diagnosis)
    if self.target_as_array:
        data['targets'] = np.array([diagnosis])
    else:
        data['targets'] = diagnosis

    return data
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    data_dir = args.data_dir
    cover = os.path.join(data_dir, "Cover")
    JMiPOD = os.path.join(data_dir, "JMiPOD")
    JUNIWARD = os.path.join(data_dir, "JUNIWARD")
    UERD = os.path.join(data_dir, "UERD")

    dataset = (
        fs.find_images_in_dir(cover)
        + fs.find_images_in_dir(JMiPOD)
        + fs.find_images_in_dir(JUNIWARD)
        + fs.find_images_in_dir(UERD)
    )
    # dataset = dataset[:500]

    df = defaultdict(list)
    for image_fname in tqdm(dataset):
        target = target_from_fname(image_fname)
        dct_fname = fs.change_extension(image_fname, ".npz")
        dct_data = np.load(dct_fname)
        qm0 = dct_data["qm0"]
        qm1 = dct_data["qm1"]
        qf = quality_factror_from_qm(qm0)
        fsize = os.stat(image_fname).st_size

        df["image_id"].append(fs.id_from_fname(image_fname))
        df["target"].append(target)
        df["quality"].append(qf)
        df["qm0"].append(qm0.flatten().tolist())
        df["qm1"].append(qm1.flatten().tolist())
        df["file_size"].append(fsize)

    df = pd.DataFrame.from_dict(df)
    df.to_csv("dataset_qf_qt.csv", index=False)
def __getitem__(self, item):
    # Read with OpenCV instead of PIL. It's faster.
    image = cv2.imread(self.images[item])
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    height, width = image.shape[:2]

    original = self.normalize(image=image)['image']
    transformed = self.transform(image=image)['image']

    data = {
        'image': tensor_from_rgb_image(transformed),
        'original': tensor_from_rgb_image(original),
        'image_id': id_from_fname(self.images[item]),
    }

    if self.meta_features:
        log_height = math.log(height)
        log_width = math.log(width)
        aspect_ratio = log_height / log_width
        mean = np.mean(image, axis=(0, 1))
        meta_features = np.array([
            log_height,
            log_width,
            aspect_ratio,
            mean[0],
            mean[1],
            mean[2],
        ])
        data['meta_features'] = meta_features

    if self.targets is not None:
        target = self.dtype(self.targets[item])
        if self.target_as_array:
            data['targets'] = np.array([target])
        else:
            data['targets'] = target

    return data
def test_evaluate_model_v2(train, validation):
    num_models = len(train)
    ids, train_x, train_y_true, train_y_average = prepare_inference_datasets(train, use_features=False, use_predictions=True)
    ids, valid_x, valid_y_true, valid_y_average = prepare_inference_datasets(validation, use_features=False, use_predictions=True)

    for i in range(num_models):
        print(
            fs.id_from_fname(train[i]),
            cohen_kappa_score(train_y_true, regression_to_class(train_x[:, i]), weights='quadratic'),
            # Validation predictions must be scored against validation targets
            cohen_kappa_score(valid_y_true, regression_to_class(valid_x[:, i]), weights='quadratic'),
        )

    print(
        'Averaged',
        cohen_kappa_score(train_y_true, regression_to_class(train_y_average), weights='quadratic'),
        cohen_kappa_score(valid_y_true, regression_to_class(valid_y_average), weights='quadratic'),
    )
    print(
        'Median ',
        cohen_kappa_score(train_y_true, regression_to_class(np.median(train_x, axis=1)), weights='quadratic'),
        cohen_kappa_score(valid_y_true, regression_to_class(np.median(valid_x, axis=1)), weights='quadratic'),
    )
    print(
        'TrimMean',
        cohen_kappa_score(train_y_true, regression_to_class(trim_mean(train_x, proportiontocut=0.1, axis=1)), weights='quadratic'),
        cohen_kappa_score(valid_y_true, regression_to_class(trim_mean(valid_x, proportiontocut=0.1, axis=1)), weights='quadratic'),
    )

    rounder = OptimizedRounder()
    rounder.fit(train_y_average, train_y_true)
    print(rounder.coefficients())
    print(
        'Optimized',
        cohen_kappa_score(train_y_true, rounder.predict(train_y_average, rounder.coefficients()), weights='quadratic'),
        cohen_kappa_score(valid_y_true, rounder.predict(valid_y_average, rounder.coefficients()), weights='quadratic'),
    )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoints", nargs="+")
    parser.add_argument("-w", "--workers", type=int, default=1, help="")
    parser.add_argument("-dd", "--data-dir", type=str, default="data", help="Data directory")
    parser.add_argument("-a", "--activation", type=str, default="pre", help="")
    args = parser.parse_args()

    targets = fs.find_in_dir(os.path.join(args.data_dir, "tier3", "masks")) + fs.find_in_dir(
        os.path.join(args.data_dir, "train", "masks")
    )
    targets_post = dict((fs.id_from_fname(fname), fname) for fname in targets if "_post_" in fname)

    df = defaultdict(list)
    current_time = datetime.now().strftime("%b%d_%H_%M")

    print("Checkpoints ", args.checkpoints)
    print("Activation ", args.activation)

    for model_checkpoint in args.checkpoints:
        model_checkpoint = fs.auto_file(model_checkpoint)
        predictions_dir = os.path.join(
            os.path.dirname(model_checkpoint), fs.id_from_fname(model_checkpoint) + "_oof_predictions"
        )

        prediction_files = fs.find_in_dir(predictions_dir)
        prediction_files_post = dict(
            (fs.id_from_fname(fname), fname) for fname in prediction_files if "_post_" in fname
        )

        y_true_filenames = [targets_post[image_id_post] for image_id_post in prediction_files_post.keys()]
        y_pred_filenames = [prediction_files_post[image_id_post] for image_id_post in prediction_files_post.keys()]

        rounder = OptimizedRounder(workers=args.workers, apply_softmax=args.activation)

        raw_score, raw_localization_f1, raw_damage_f1, raw_damage_f1s = rounder.predict(
            y_pred_filenames, y_true_filenames, np.array([1, 1, 1, 1, 1], dtype=np.float32)
        )
        rounder.fit(y_pred_filenames, y_true_filenames)

        score, localization_f1, damage_f1, damage_f1s = rounder.predict(
            y_pred_filenames, y_true_filenames, rounder.coefficients()
        )
        print(rounder.coefficients())

        df["checkpoint"].append(fs.id_from_fname(model_checkpoint))
        df["coefficients"].append(rounder.coefficients())
        df["samples"].append(len(y_true_filenames))

        df["raw_score"].append(raw_score)
        df["raw_localization"].append(raw_localization_f1)
        df["raw_damage"].append(raw_damage_f1)

        df["opt_score"].append(score)
        df["opt_localization"].append(localization_f1)
        df["opt_damage"].append(damage_f1)

    dataframe = pd.DataFrame.from_dict(df)
    dataframe.to_csv(f"optimized_weights_{current_time}.csv", index=None)
    print(dataframe)
def convert_dir(df: pd.DataFrame, dir) -> pd.DataFrame:
    crops_dir = os.path.join(dir, "crops")
    os.makedirs(crops_dir, exist_ok=True)

    building_crops = []
    global_crop_index = 0

    for i, row in tqdm(df.iterrows(), total=len(df)):
        image_pre = read_image(os.path.join(dir, row["image_fname_pre"]))
        image_post = read_image(os.path.join(dir, row["image_fname_post"]))

        mask_fname_post = row["mask_fname_post"]
        json_fname_post = fs.change_extension(mask_fname_post.replace("masks", "labels"), ".json")
        inference_data = open_json(os.path.join(dir, json_fname_post))

        instance_image, labels = create_instance_image(inference_data)

        for label_index, damage_label in zip(range(1, instance_image.max() + 1), labels):
            try:
                instance_mask = instance_image == label_index
                rmin, rmax, cmin, cmax = bbox1(instance_mask)

                max_size = max(rmax - rmin, cmax - cmin)
                if max_size < 16:
                    print(
                        "Skipping crop since it's too small",
                        fs.id_from_fname(mask_fname_post),
                        "label_index", label_index,
                        "max_size", max_size,
                    )
                    continue

                # Pad the bounding box by a quarter of its extent on each side
                rpadding = (rmax - rmin) // 4
                cpadding = (cmax - cmin) // 4

                pre_crop = image_pre[max(0, rmin - rpadding):rmax + rpadding, max(0, cmin - cpadding):cmax + cpadding]
                post_crop = image_post[max(0, rmin - rpadding):rmax + rpadding, max(0, cmin - cpadding):cmax + cpadding]

                image_id_pre = row["image_id_pre"]
                image_id_post = row["image_id_post"]

                pre_crop_fname = f"{global_crop_index:06}_{image_id_pre}.png"
                post_crop_fname = f"{global_crop_index:06}_{image_id_post}.png"
                global_crop_index += 1

                cv2.imwrite(os.path.join(crops_dir, pre_crop_fname), pre_crop)
                cv2.imwrite(os.path.join(crops_dir, post_crop_fname), post_crop)

                building_crops.append({
                    "pre_crop_fname": pre_crop_fname,
                    "post_crop": post_crop_fname,
                    "label": damage_label,
                    "event_name": row["event_name_post"],
                    "fold": row["fold_post"],
                    "rmin": rmin,
                    "rmax": rmax,
                    "cmin": cmin,
                    "cmax": cmax,
                    "max_size": max_size,
                    "rpadding": rpadding,
                    "cpadding": cpadding,
                })
            except Exception as e:
                print(e)
                print(mask_fname_post)

    df = pd.DataFrame.from_records(building_crops)
    return df
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("experiments", nargs="+", type=str)
    parser.add_argument("-o", "--output", type=str, required=False)
    parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2"))
    args = parser.parse_args()

    output_dir = os.path.dirname(__file__)
    data_dir = args.data_dir
    experiments = args.experiments
    output_file = args.output

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids_h = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    if False:
        image_fnames_h = [
            os.path.join(data_dir, INDEX_TO_METHOD[method], f"{image_id}.jpg")
            for (image_id, method) in zip(image_ids_h, y)
        ]

        test_image_ids = pd.read_csv(test_predictions[0]).image_id.tolist()
        image_fnames_t = [os.path.join(data_dir, "Test", image_id) for image_id in test_image_ids]

        entropy_t = compute_image_features(image_fnames_t)
        x_test = np.column_stack([x_test, entropy_t])

        # entropy_h = entropy_t.copy()
        # x = x_test.copy()
        entropy_h = compute_image_features(image_fnames_h)
        x = np.column_stack([x, entropy_h])
        print("Added image features", entropy_h.shape, entropy_t.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "min_child_weight": [1, 5, 10],
        "gamma": [1e-3, 1e-2, 1e-2, 0.5, 2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "max_depth": [2, 3, 4, 5, 6],
        "n_estimators": [16, 32, 64, 128, 256, 1000],
        "learning_rate": [0.001, 0.01, 0.05, 0.2, 1],
    }

    xgb = XGBClassifier(objective="binary:logistic", nthread=1)

    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        n_jobs=4,
        n_iter=25,
        cv=group_kfold.split(x, y, groups=image_ids_h),
        verbose=3,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("xgb-random-grid-search-results-01.csv", index=False)

    test_pred = random_search.predict_proba(x_test)[:, 1]

    if output_file is None:
        with_logits_sfx = "_with_logits" if with_logits else ""
        submit_fname = os.path.join(
            output_dir, f"xgb_cls_gs_{random_search.best_score_:.4f}_{checksum}{with_logits_sfx}.csv"
        )
    else:
        submit_fname = output_file

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)

    import json
    with open(fs.change_extension(submit_fname, ".json"), "w") as f:
        json.dump(random_search.best_params_, f, indent=2)
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")

    fnames_for_checksum = [x + "cauc" for x in experiments]
    checksum = compute_checksum_v2(fnames_for_checksum)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    print("Unique image ids", len(np.unique(image_ids)))
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y(holdout_predictions)
    print(x.shape, y.shape)

    x_test, _ = get_x_y(test_predictions)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    test_dmatrix = xgb.DMatrix(x_test)

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    params = {
        "base_score": 0.5,
        "booster": "gblinear",
        # "booster": "gbtree",
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 1,
        # "gamma": 1.0,
        "learning_rate": 0.01,
        "max_delta_step": 0,
        "objective": "binary:logistic",
        "eta": 0.1,
        "reg_lambda": 0,
        "subsample": 0.8,
        "scale_pos_weight": 1,
        "min_child_weight": 2,
        "max_depth": 5,
        "tree_method": "exact",
        "seed": 42,
        "alpha": 0.01,
        "lambda": 0.01,
        "n_estimators": 256,
        "gamma": 0.01,
        "disable_default_eval_metric": 1,
        # "eval_metric": "wauc",
    }

    for fold_index, (train_index, valid_index) in enumerate(group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )
        train_dmatrix = xgb.DMatrix(x_train.copy(), y_train.copy())
        valid_dmatrix = xgb.DMatrix(x_valid.copy(), y_valid.copy())

        xgb_model = xgb.train(
            params,
            train_dmatrix,
            num_boost_round=5000,
            verbose_eval=True,
            feval=xgb_weighted_auc,
            maximize=True,
            evals=[(valid_dmatrix, "validation")],
        )

        y_valid_pred = xgb_model.predict(valid_dmatrix)
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        if test_pred is not None:
            test_pred += xgb_model.predict(test_dmatrix) * one_over_n
        else:
            test_pred = xgb_model.predict(test_dmatrix) * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(output_dir, f"xgb_{np.mean(cv_scores):.4f}_{checksum}_.csv")

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def evaluate_generalization(checkpoints, num_folds=4):
    num_datasets = len(checkpoints)
    # kappa_matrix = np.zeros((num_datasets, num_datasets), dtype=np.float32)
    class_names = list(checkpoints.keys())
    # results = {}

    # For each dataset trained on
    for dataset_trained_on, checkpoints_per_fold in checkpoints.items():
        # For each checkpoint
        for fold_trained_on, checkpoint_file in enumerate(checkpoints_per_fold):
            if checkpoint_file is None:
                continue

            # Load model
            checkpoint = torch.load(checkpoint_file)
            model_name = checkpoint['checkpoint_data']['cmd_args']['model']
            batch_size = 16  # checkpoint['checkpoint_data']['cmd_args']['batch_size']
            num_classes = len(get_class_names())
            model = get_model(model_name, pretrained=False, num_classes=num_classes)
            model.load_state_dict(checkpoint['model_state_dict'])
            model = model.eval().cuda()

            if torch.cuda.device_count() > 1:
                model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))

            # For each available dataset
            for dataset_index, dataset_validate_on in enumerate(class_names):
                for fold_validate_on in range(num_folds):
                    _, valid_ds, _ = get_datasets(
                        use_aptos2015=dataset_validate_on == 'aptos2015',
                        use_aptos2019=dataset_validate_on == 'aptos2019',
                        use_messidor=dataset_validate_on == 'messidor',
                        use_idrid=dataset_validate_on == 'idrid',
                        fold=fold_validate_on,
                        folds=num_folds,
                    )

                    data_loader = DataLoader(
                        valid_ds,
                        batch_size * torch.cuda.device_count(),
                        pin_memory=True,
                        num_workers=8,
                    )

                    predictions = defaultdict(list)
                    for batch in tqdm(
                        data_loader,
                        desc=f'Evaluating {dataset_validate_on} fold {fold_validate_on} on {checkpoint_file}',
                    ):
                        input = batch['image'].cuda(non_blocking=True)
                        outputs = model(input)

                        logits = to_numpy(outputs['logits'].softmax(dim=1))
                        regression = to_numpy(outputs['regression'])
                        features = to_numpy(outputs['features'])

                        predictions['image_id'].extend(batch['image_id'])
                        predictions['diagnosis_true'].extend(to_numpy(batch['targets']))
                        predictions['logits'].extend(logits)
                        predictions['regression'].extend(regression)
                        predictions['features'].extend(features)

                    pickle_name = id_from_fname(checkpoint_file) + f'_on_{dataset_validate_on}_fold{fold_validate_on}.pkl'
                    df = pd.DataFrame.from_dict(predictions)
                    df.to_pickle(pickle_name)
def model_from_checkpoint(
    model_checkpoint: str,
    tta: Optional[str] = None,
    activation_after="model",
    model=None,
    report=True,
    classifiers=True,
) -> Tuple[nn.Module, Dict]:
    checkpoint = torch.load(model_checkpoint, map_location="cpu")
    model_name = model or checkpoint["checkpoint_data"]["cmd_args"]["model"]

    score = float(checkpoint["epoch_metrics"]["valid"]["weighted_f1"])
    loc = float(checkpoint["epoch_metrics"]["valid"]["weighted_f1/localization_f1"])
    dmg = float(checkpoint["epoch_metrics"]["valid"]["weighted_f1/damage_f1"])
    fold = int(checkpoint["checkpoint_data"]["cmd_args"]["fold"])

    if report:
        print(model_checkpoint, model_name)
        report_checkpoint(checkpoint)

    model = get_model(model_name, pretrained=False, classifiers=classifiers)
    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
    del checkpoint

    if activation_after == "model":
        model = ApplySoftmaxTo(model, OUTPUT_MASK_KEY)

    if tta == "multiscale":
        print(f"Using {tta}")
        model = MultiscaleTTA(model, outputs=[OUTPUT_MASK_KEY], size_offsets=[-256, -128, +128, +256], average=True)

    if tta == "flip":
        print(f"Using {tta}")
        model = HFlipTTA(model, outputs=[OUTPUT_MASK_KEY], average=True)

    if tta == "flipscale":
        print(f"Using {tta}")
        model = HFlipTTA(model, outputs=[OUTPUT_MASK_KEY], average=True)
        model = MultiscaleTTA(model, outputs=[OUTPUT_MASK_KEY], size_offsets=[-256, -128, +128, +256], average=True)

    if tta == "multiscale_d4":
        print(f"Using {tta}")
        model = D4TTA(model, outputs=[OUTPUT_MASK_KEY], average=True)
        model = MultiscaleTTA(model, outputs=[OUTPUT_MASK_KEY], size_offsets=[-256, -128, +128, +256], average=True)

    if activation_after == "tta":
        model = ApplySoftmaxTo(model, OUTPUT_MASK_KEY)

    info = {
        "model": fs.id_from_fname(model_checkpoint),
        "model_name": model_name,
        "fold": fold,
        "score": score,
        "localization": loc,
        "damage": dmg,
    }
    return model, info
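# Usage sketch (hypothetical checkpoint path, not from the original source):
# wrap the loaded model in horizontal-flip TTA and apply softmax only after
# TTA averaging, as handled above.
#
# model, info = model_from_checkpoint("runs/fold0/best.pth", tta="flip", activation_after="tta")
# print(info["model_name"], info["fold"], info["score"])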
import os

import numpy as np
from pytorch_toolbelt.utils import fs
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import pandas as pd

submissions = [x for x in fs.find_in_dir(".") if x.endswith(".csv")]
names = [fs.id_from_fname(x)[:32] for x in submissions]
submissions = [pd.read_csv(x).sort_values(by="Id").reset_index() for x in submissions]

# Pairwise Spearman rank correlation between submissions
cm = np.zeros((len(submissions), len(submissions)))
for i in range(len(submissions)):
    for j in range(len(submissions)):
        cm[i, j] = spearmanr(submissions[i].Label, submissions[j].Label).correlation

print(cm)

plt.figure(figsize=(10 + len(submissions), 10 + len(submissions)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=names)
disp.plot(include_values=True, cmap="Blues", ax=plt.gca(), xticks_rotation=45)
plt.tight_layout()
plt.show()
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    for fold_index, (train_index, valid_index) in enumerate(group_kfold.split(x, y, groups=image_ids)):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )

        clf = LazyClassifier(verbose=True, ignore_warnings=False, custom_metric=alaska_weighted_auc, predictions=True)
        models, predictions = clf.fit(x_train, x_valid, y_train, y_valid)
        print(models)

        models.to_csv(os.path.join(output_dir, f"lazypredict_models_{fold_index}_{checksum}.csv"))
        predictions.to_csv(os.path.join(output_dir, f"lazypredict_preds_{fold_index}_{checksum}.csv"))
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        # "A_May24_11_08_ela_skresnext50_32x4d_fold0_fp16",
        # "A_May15_17_03_ela_skresnext50_32x4d_fold1_fp16",
        # "A_May21_13_28_ela_skresnext50_32x4d_fold2_fp16",
        # "A_May26_12_58_ela_skresnext50_32x4d_fold3_fp16",
        #
        # "B_Jun05_08_49_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "B_Jun09_16_38_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        # "B_Jun11_08_51_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        # "B_Jun11_18_38_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        #
        # "C_Jun24_22_00_rgb_tf_efficientnet_b2_ns_fold2_local_rank_0_fp16",
        #
        # "D_Jun18_16_07_rgb_tf_efficientnet_b7_ns_fold1_local_rank_0_fp16",
        # "D_Jun20_09_52_rgb_tf_efficientnet_b7_ns_fold2_local_rank_0_fp16",
        #
        # "E_Jun18_19_24_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        # "E_Jun21_10_48_rgb_tf_efficientnet_b6_ns_fold0_istego100k_local_rank_0_fp16",
        #
        # "F_Jun29_19_43_rgb_tf_efficientnet_b3_ns_fold0_local_rank_0_fp16",
        # "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=True, tta_logits=True)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=True, tta_logits=True)
    print(x_test.shape)

    if False:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )
        print(np.bincount(y_train), np.bincount(y_valid))

        # cls = LinearDiscriminantAnalysis()
        cls = LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto", priors=[0.5, 0.5])
        cls.fit(x_train, y_train)

        y_valid_pred = cls.predict_proba(x_valid)[:, 1]
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        if test_pred is not None:
            test_pred += cls.predict_proba(x_test)[:, 1] * one_over_n
        else:
            test_pred = cls.predict_proba(x_test)[:, 1] * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    submit_fname = os.path.join(output_dir, f"lda_{np.mean(cv_scores):.4f}_{checksum}.csv")

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)
    cv_scores = []
    test_pred = None
    one_over_n = 1.0 / group_kfold.n_splits

    for train_index, valid_index in group_kfold.split(x, y, groups=image_ids):
        x_train, x_valid, y_train, y_valid = (
            x[train_index],
            x[valid_index],
            y[train_index],
            y[valid_index],
        )
        print(np.bincount(y_train), np.bincount(y_valid))

        cls = XGBClassifier(
            base_score=0.5,
            booster="gbtree",
            colsample_bylevel=1,
            colsample_bynode=1,
            colsample_bytree=0.6,
            gamma=0.5,
            gpu_id=-1,
            importance_type="gain",
            interaction_constraints="",
            learning_rate=0.01,
            max_delta_step=0,
            max_depth=3,
            min_child_weight=10,
            # missing=nan,
            monotone_constraints="()",
            n_estimators=1000,
            n_jobs=8,
            nthread=1,
            num_parallel_tree=1,
            objective="binary:logistic",
            random_state=0,
            reg_alpha=0,
            reg_lambda=1,
            scale_pos_weight=1,
            silent=True,
            subsample=0.8,
            tree_method="exact",
            validate_parameters=1,
            verbosity=2,
        )
        cls.fit(x_train, y_train)

        y_valid_pred = cls.predict_proba(x_valid)[:, 1]
        score = alaska_weighted_auc(y_valid, y_valid_pred)
        cv_scores.append(score)

        if test_pred is not None:
            test_pred += cls.predict_proba(x_test)[:, 1] * one_over_n
        else:
            test_pred = cls.predict_proba(x_test)[:, 1] * one_over_n

    for s in cv_scores:
        print(s)
    print(np.mean(cv_scores), np.std(cv_scores))

    with_logits_sfx = "_with_logits" if with_logits else ""
    submit_fname = os.path.join(output_dir, f"xgb_cls_{np.mean(cv_scores):.4f}_{checksum}{with_logits_sfx}.csv")

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)
    print("Saved submission to ", submit_fname)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("predictions", nargs="+")
    parser.add_argument("-w", "--workers", type=int, default=0, help="")
    parser.add_argument("-dd", "--data-dir", type=str, default="data", help="Data directory")
    args = parser.parse_args()

    targets = fs.find_in_dir(os.path.join(args.data_dir, "tier3", "masks")) + fs.find_in_dir(
        os.path.join(args.data_dir, "train", "masks")
    )
    targets_post = dict((fs.id_from_fname(fname), fname) for fname in targets if "_post_" in fname)

    df = defaultdict(list)

    postprocessings = {
        "naive": make_predictions_naive,
        "dominant": make_predictions_dominant,
        "floodfill": make_predictions_floodfill,
    }

    for predictions_dir in args.predictions:
        try:
            prediction_files = fs.find_in_dir(predictions_dir)
            prediction_files_post = dict(
                (fs.id_from_fname(fname), fname) for fname in prediction_files if "_post_" in fname
            )

            y_true_filenames = [targets_post[image_id_post] for image_id_post in prediction_files_post.keys()]
            y_pred_filenames = [prediction_files_post[image_id_post] for image_id_post in prediction_files_post.keys()]

            for name, fn in postprocessings.items():
                score, localization_f1, damage_f1, damage_f1s = optimize_postprocessing(
                    y_pred_filenames, y_true_filenames, postprocessing_fn=fn, workers=args.workers
                )
                print(name, score)

                df["samples"].append(len(y_pred_filenames))
                df["predictions_dir"].append(predictions_dir)
                df["postprocessing"].append(name)
                df["score"].append(score)
                df["localization_f1"].append(localization_f1)
                df["damage_f1"].append(damage_f1)
        except Exception as e:
            print("Failed to process", predictions_dir, e)

    df = pd.DataFrame.from_dict(df)
    print(df)

    current_time = datetime.now().strftime("%b%d_%H_%M")
    df.to_csv(f"postprocessing_eval_{current_time}.csv", index=None)
def main():
    output_dir = os.path.dirname(__file__)

    experiments = [
        "G_Jul03_21_14_nr_rgb_tf_efficientnet_b6_ns_fold0_local_rank_0_fp16",
        "G_Jul05_00_24_nr_rgb_tf_efficientnet_b6_ns_fold1_local_rank_0_fp16",
        "G_Jul06_03_39_nr_rgb_tf_efficientnet_b6_ns_fold2_local_rank_0_fp16",
        "G_Jul07_06_38_nr_rgb_tf_efficientnet_b6_ns_fold3_local_rank_0_fp16",
        # "H_Jul12_18_42_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        # "K_Jul17_17_09_nr_rgb_tf_efficientnet_b6_ns_mish_fold0_local_rank_0_fp16",
        "J_Jul19_20_10_nr_rgb_tf_efficientnet_b7_ns_mish_fold1_local_rank_0_fp16",
        "H_Jul11_16_37_nr_rgb_tf_efficientnet_b7_ns_mish_fold2_local_rank_0_fp16",
        "K_Jul18_16_41_nr_rgb_tf_efficientnet_b6_ns_mish_fold3_local_rank_0_fp16",
    ]

    holdout_predictions = get_predictions_csv(experiments, "cauc", "holdout", "d4")
    test_predictions = get_predictions_csv(experiments, "cauc", "test", "d4")
    checksum = compute_checksum_v2(experiments)

    holdout_ds = get_holdout("", features=[INPUT_IMAGE_KEY])
    image_ids = [fs.id_from_fname(x) for x in holdout_ds.images]
    quality_h = F.one_hot(torch.tensor(holdout_ds.quality).long(), 3).numpy().astype(np.float32)

    test_ds = get_test_dataset("", features=[INPUT_IMAGE_KEY])
    quality_t = F.one_hot(torch.tensor(test_ds.quality).long(), 3).numpy().astype(np.float32)

    with_logits = True

    x, y = get_x_y_for_stacking(holdout_predictions, with_logits=with_logits, tta_logits=with_logits)
    # Force target to be binary
    y = (y > 0).astype(int)
    print(x.shape, y.shape)

    x_test, _ = get_x_y_for_stacking(test_predictions, with_logits=with_logits, tta_logits=with_logits)
    print(x_test.shape)

    if True:
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if False:
        sc = PCA(n_components=16)
        x = sc.fit_transform(x)
        x_test = sc.transform(x_test)

    if True:
        x = np.column_stack([x, quality_h])
        x_test = np.column_stack([x_test, quality_t])

    group_kfold = GroupKFold(n_splits=5)

    params = {
        "boosting_type": ["gbdt", "dart", "rf", "goss"],
        "num_leaves": [16, 32, 64, 128],
        "reg_alpha": [0, 0.01, 0.1, 0.5],
        "reg_lambda": [0, 0.01, 0.1, 0.5],
        "learning_rate": [0.001, 0.01, 0.1, 0.5],
        "n_estimators": [32, 64, 126, 512],
        "max_depth": [2, 4, 8],
        "min_child_samples": [20, 40, 80, 100],
    }

    lgb_estimator = lgb.LGBMClassifier(objective="binary", silent=True)

    random_search = RandomizedSearchCV(
        lgb_estimator,
        param_distributions=params,
        scoring=make_scorer(alaska_weighted_auc, greater_is_better=True, needs_proba=True),
        n_jobs=3,
        n_iter=50,
        cv=group_kfold.split(x, y, groups=image_ids),
        verbose=2,
        random_state=42,
    )

    # Here we go
    random_search.fit(x, y)

    test_pred = random_search.predict_proba(x_test)[:, 1]
    print(test_pred)

    submit_fname = os.path.join(output_dir, f"lgbm_gs_{random_search.best_score_:.4f}_{checksum}.csv")

    df = pd.read_csv(test_predictions[0]).rename(columns={"image_id": "Id"})
    df["Label"] = test_pred
    df[["Id", "Label"]].to_csv(submit_fname, index=False)

    print("\n All results:")
    print(random_search.cv_results_)
    print("\n Best estimator:")
    print(random_search.best_estimator_)
    print(random_search.best_score_)
    print("\n Best hyperparameters:")
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    results.to_csv("lgbm-random-grid-search-results-01.csv", index=False)