def search_blending_weight(
    predictions: List[np.ndarray],
    target: np.ndarray,
    n_iter: int,
    func: Callable[[np.ndarray, np.ndarray], float] = rmse,
    is_higher_better: bool = False,
) -> Tuple[float, np.ndarray]:
    """Randomly search convex blending weights that optimize `func`."""
    best_weights = np.zeros(len(predictions))
    best_score = -np.inf if is_higher_better else np.inf

    for i in range(n_iter):
        seed_everything(i)
        dice = np.random.rand(len(predictions))
        weights = dice / dice.sum()  # normalize so the weights sum to 1

        blended = np.zeros(len(predictions[0]))
        for weight, pred in zip(weights, predictions):
            blended += weight * pred
        score = func(blended, target)

        if is_higher_better:
            if score > best_score:
                best_score = score
                best_weights = weights
        else:
            if score < best_score:
                best_score = score
                best_weights = weights

    return best_score, best_weights
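# A minimal usage sketch of the search above, assuming `rmse(pred, target)` is the
# project's root-mean-squared-error helper (the default `func`) and `seed_everything`
# is importable. The three dummy predictions and the iteration budget are made up
# purely for illustration.
import numpy as np

preds = [np.random.rand(100) for _ in range(3)]   # three models' predictions (dummy data)
target = np.random.rand(100)

best_score, best_weights = search_blending_weight(preds, target, n_iter=1000)
print(best_score, best_weights)  # weights sum to 1 by construction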
def run_baseline():
    time_experiment = datetime.now().strftime("%m%d%Y_%H%M")
    seed_everything(0)

    ds = Dataset()
    ds.load_dataset()

    from sklearn.preprocessing import LabelEncoder

    # Label-encode object columns while preserving NaNs through a sentinel value.
    nan_constant = -999
    for col, col_type in ds.X_train.dtypes.items():
        if col_type == "object":
            ds.X_train[col] = ds.X_train[col].fillna(nan_constant)
            ds.X_test[col] = ds.X_test[col].fillna(nan_constant)

            lbl = LabelEncoder()
            lbl.fit(list(ds.X_train[col].values) + list(ds.X_test[col].values))
            ds.X_train[col] = lbl.transform(list(ds.X_train[col].values))
            ds.X_test[col] = lbl.transform(list(ds.X_test[col].values))

            # Restore NaNs where the sentinel ended up after encoding.
            if nan_constant in lbl.classes_:
                nan_transformed = lbl.transform([nan_constant])[0]
                ds.X_train.loc[ds.X_train[col] == nan_transformed, col] = np.nan
                ds.X_test.loc[ds.X_test[col] == nan_transformed, col] = np.nan

        if col in ds.categorical_cols:
            ds.X_train[col] = ds.X_train[col].fillna(-1).astype("category")
            ds.X_test[col] = ds.X_test[col].fillna(-1).astype("category")

    lgb_params = {
        "n_estimators": 50000,
        "early_stopping_rounds": 200,
        "num_leaves": 256,
        "learning_rate": 0.03,
        "max_depth": 9,
        "objective": "binary",
        "metric": "auc",
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "scale_pos_weight": 5.5,  # mimic a fraud rate of ~20%: 0.2 * N_legit / N_fraud
        "boosting_type": "gbdt",
        "seed": 1337,
        "n_jobs": -1,
        "verbosity": -1,
    }

    # Keep folds in time order; random_state has no effect (and recent sklearn
    # raises) when shuffle=False, so it is not passed here.
    folds = KFold(n_splits=5, shuffle=False)
    result = run_train_predict(ds, clf_lgb, lgb_params, folds, None)

    path_to_preds = f"baseline_lgb_{time_experiment}"
    ds.submission["isFraud"] = result["prediction"]
    ds.write_submission(path_to_preds)
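# For context, a hedged sketch of how the hard-coded scale_pos_weight above could be
# derived from the training labels instead, following the formula in the comment.
# `ds.y_train` is an *assumed* attribute — the baseline only shows X_train/X_test.
n_fraud = int((ds.y_train == 1).sum())  # hypothetical: ds.y_train may not exist as named
n_legit = int((ds.y_train == 0).sum())
lgb_params["scale_pos_weight"] = 0.2 * n_legit / n_fraud  # ~5.5 on the IEEE fraud data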
def main(cfg: DictConfig):
    print('VinBigData Training Classification')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    # Config -------------------------------------------------------------------
    data_dir = cfg.data.data_dir
    seed_everything(cfg.data.seed)

    load_dotenv('.env')
    wandb.login()
    wandb_logger = WandbLogger(project='VinBigData-Classification', reinit=True)
    wandb_logger.log_hyperparams(dict(cfg.data))
    wandb_logger.log_hyperparams(dict(cfg.train))
    wandb_logger.log_hyperparams(dict(cfg.aug_kwargs_classification))

    # Data Module ---------------------------------------------------------------
    transform = ImageTransform(cfg, type='classification')
    cv = StratifiedKFold(n_splits=cfg.data.n_splits)
    dm = ChestXrayDataModule(data_dir, cfg, transform, cv,
                             data_type='classification', sample=False)

    # Model -----------------------------------------------------------
    net = Timm_model(cfg.train.backbone, out_dim=1)

    # Loss fn -----------------------------------------------------------
    criterion = nn.BCEWithLogitsLoss()

    # Optimizer, Scheduler -----------------------------------------------------------
    optimizer = optim.Adam(net.parameters(), lr=cfg.train.lr)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.train.epoch, eta_min=0)

    # Lightning Module
    model = XrayLightningClassification(net, cfg, criterion, optimizer, scheduler)

    # Trainer --------------------------------------------------------------------------
    trainer = Trainer(
        logger=wandb_logger,
        log_every_n_steps=100,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        num_sanity_val_steps=0,
        # deterministic=True,
        amp_level='O2',
        amp_backend='apex'
    )

    # Train
    trainer.fit(model, datamodule=dm)

    # Stop Logging
    wandb.finish()

    # Clean up intermediate weight files
    for p in model.weight_paths:
        os.remove(p)
"--accumulation", type=int, default=1, help="Number of accumulation steps.", ) parser.add_argument("--warmup", type=int, default=3, help="Number of warmup epochs.") parser.add_argument("--out_suff", type=str, help="Suffix added to the saved model name.") args = parser.parse_args() seed_everything(13) writer = SummaryWriter("runs/test_run_logP_warmup15") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device = torch.device("cpu") print("Using device:", device) print() # Additional info when using cuda if device.type == "cuda": print(torch.cuda.get_device_name(0)) print("Memory Usage:") print( "Allocated:", round(torch.cuda.memory_allocated(0) / 1024**3, 1), "GB",
if __name__ == "__main__": sys.path.append("./") pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed) cp.cuda.set_allocator(pool.malloc) warnings.filterwarnings("ignore") parser = get_preprocess_parser() args = parser.parse_args() config = load_config(args.config) configure_logger(args.config, log_dir=args.log_dir, debug=args.debug) seed_everything(config["seed_everything"]) logging.info(f"config: {args.config}") logging.info(f"debug: {args.debug}") config["args"] = dict() config["args"]["config"] = args.config # make output dir output_root_dir = Path(config["output_dir"]) feature_dir = Path(config["dataset"]["feature_dir"]) config_name = args.config.split("/")[-1].replace(".yml", "") output_dir = output_root_dir / config_name output_dir.mkdir(parents=True, exist_ok=True)
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    # Random Seed
    seed_everything(cfg.train.seed)

    # Model ####################################################################
    net = ENet(model_name=cfg.train.model_name)
    transform = ImageTransform(img_size=cfg.data.img_size)

    # Comet.ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name)

    # Log Parameters
    experiment.log_parameters(dict(cfg.exp))
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Lightning Module #########################################################
    model = LightningSystem(net, cfg, experiment)
    datamodule = DataModule(data_dir, cfg, transform, cv)

    checkpoint_callback = ModelCheckpoint(filepath='./checkpoint',
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          prefix=cfg.exp.exp_name + '_')

    trainer = Trainer(logger=False,
                      max_epochs=cfg.train.epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=1)

    # Train & Test #############################################################
    # Train
    trainer.fit(model, datamodule=datamodule)
    experiment.log_metric('best_auc', model.best_auc)
    checkpoint_path = glob.glob(f'./checkpoint/{cfg.exp.exp_name}_*.ckpt')[0]
    experiment.log_asset(file_data=checkpoint_path)

    # Test
    for i in range(test_num):
        trainer.test(model)

    # Submit
    sub_list = glob.glob(f'submission_{cfg.exp.exp_name}*.csv')
    _ = summarize_submit(sub_list, experiment,
                         filename=f'sub_{cfg.exp.exp_name}.csv')

    # oof
    oof_dataset = datamodule.oof_dataset
    oof_dataloader = DataLoader(oof_dataset,
                                batch_size=cfg.train.batch_size,
                                pin_memory=False,
                                shuffle=False,
                                drop_last=False)
    for i in range(10):
        trainer.test(model, test_dataloaders=oof_dataloader)

    # Submit
    sub_list = glob.glob('submission*.csv')
    _ = summarize_submit(sub_list, experiment,
                         filename=f'oof_{cfg.exp.exp_name}.csv')
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    seed_everything(cfg.data.seed)

    # wandb
    wandb.init(project='VinBigData-Detection')
    wandb.config.update(dict(cfg.data))
    wandb.config.update(dict(cfg.train))
    wandb.config.update(dict(cfg.aug_kwargs_detection))
    wandb.config.update(dict(cfg.classification_kwargs))

    # omegaconf -> dict
    rep_aug_kwargs = OmegaConf.to_container(cfg.aug_kwargs_detection)

    class_name_dict = {
        0: 'Aortic enlargement',
        1: 'Atelectasis',
        2: 'Calcification',
        3: 'Cardiomegaly',
        4: 'Consolidation',
        5: 'ILD',
        6: 'Infiltration',
        7: 'Lung Opacity',
        8: 'Nodule/Mass',
        9: 'Other lesion',
        10: 'Pleural effusion',
        11: 'Pleural thickening',
        12: 'Pneumothorax',
        13: 'Pulmonary fibrosis',
    }

    # Setting --------------------------------------------------
    data_dir = cfg.data.data_dir
    output_dir = cfg.data.output_dir
    img_size = cfg.data.img_size
    backbone = cfg.data.backbone
    use_class14 = cfg.data.use_class14

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    if use_class14:
        class_name_dict.update({14: 'No finding'})

    # Register Dataset --------------------------------------------------
    anno_df = pd.read_csv(os.path.join(data_dir, 'train_wbf_th0.7.csv'))
    if not use_class14:
        anno_df = anno_df[anno_df['class_id'] != 14].reset_index(drop=True)

    # Extract rad id
    if cfg.data.rad_id != 'all':
        anno_df = anno_df[anno_df['rad_id'].isin(cfg.data.rad_id)].reset_index()

    if debug:
        anno_df = anno_df.head(100)

    # Split train, valid data - random
    if 'valid' in cfg.data.split_method:
        split_rate = float(cfg.data.split_method.split('_')[1]) / 100
        unique_image_ids = anno_df['image_id'].values
        unique_image_ids = np.random.RandomState(cfg.data.seed).permutation(unique_image_ids)

        train_image_ids = unique_image_ids[:int(len(unique_image_ids) * (1 - split_rate))]
        valid_image_ids = unique_image_ids[int(len(unique_image_ids) * (1 - split_rate)):]

        DatasetCatalog.register(
            "xray_valid",
            lambda d='valid': get_xray_dict(anno_df, data_dir, cfg, valid_image_ids))
        MetadataCatalog.get("xray_valid").set(thing_classes=list(class_name_dict.values()))
    else:
        train_image_ids = anno_df['image_id'].values

    DatasetCatalog.register(
        "xray_train",
        lambda d='train': get_xray_dict(anno_df, data_dir, cfg, train_image_ids))
    MetadataCatalog.get("xray_train").set(thing_classes=list(class_name_dict.values()))

    DatasetCatalog.register("xray_test", lambda d='test': get_test_xray_dict(data_dir))
    MetadataCatalog.get("xray_test").set(thing_classes=list(class_name_dict.values()))

    # Config --------------------------------------------------
    detectron2_cfg = get_cfg()
    detectron2_cfg.aug_kwargs = CN(rep_aug_kwargs)
    detectron2_cfg.merge_from_file(model_zoo.get_config_file(backbone))
    detectron2_cfg.DATASETS.TRAIN = ("xray_train",)
    if 'valid' in cfg.data.split_method:
        detectron2_cfg.DATASETS.TEST = ("xray_valid",)
        detectron2_cfg.TEST.EVAL_PERIOD = cfg.train.max_iter // 10
    else:
        detectron2_cfg.DATASETS.TEST = ()
    detectron2_cfg.INPUT.MIN_SIZE_TRAIN = (img_size,)
    detectron2_cfg.INPUT.MAX_SIZE_TRAIN = img_size
    detectron2_cfg.DATALOADER.NUM_WORKERS = cfg.train.num_workers
    detectron2_cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(backbone)
    detectron2_cfg.SOLVER.IMS_PER_BATCH = cfg.train.ims_per_batch
    detectron2_cfg.SOLVER.BASE_LR = cfg.train.lr
    detectron2_cfg.SOLVER.MAX_ITER = cfg.train.max_iter
    detectron2_cfg.SOLVER.LR_SCHEDULER_NAME = "WarmupCosineLR"
    detectron2_cfg.SOLVER.WARMUP_ITERS = 2000
    detectron2_cfg.SOLVER.CHECKPOINT_PERIOD = 200000
    detectron2_cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = cfg.train.batch_size_per_image
    detectron2_cfg.MODEL.ROI_HEADS.NUM_CLASSES = 15 if use_class14 else 14
    detectron2_cfg.OUTPUT_DIR = output_dir
    detectron2_cfg.SEED = cfg.data.seed
    detectron2_cfg.PIXEL_MEAN = [103.530, 116.280, 123.675]
    detectron2_cfg.PIXEL_STD = [1.0, 1.0, 1.0]

    # Train --------------------------------------------------
    os.makedirs(detectron2_cfg.OUTPUT_DIR, exist_ok=True)
    # trainer = DefaultTrainer(detectron2_cfg)
    trainer = MyTrainer(detectron2_cfg)
    trainer.resume_or_load(resume=True)
    trainer.train()

    # Rename Last Weight
    renamed_model = f"{backbone.split('.')[0].replace('/', '-')}.pth"
    os.rename(os.path.join(cfg.data.output_dir, 'model_final.pth'),
              os.path.join(cfg.data.output_dir, renamed_model))

    # Logging
    for model_path in glob.glob(os.path.join(cfg.data.output_dir, '*.pth')):
        wandb.save(model_path)

    # Inference Setting ------------------------------------------------------
    detectron2_cfg = get_cfg()
    detectron2_cfg.merge_from_file(model_zoo.get_config_file(backbone))
    detectron2_cfg.MODEL.ROI_HEADS.NUM_CLASSES = 15 if use_class14 else 14
    detectron2_cfg.MODEL.WEIGHTS = os.path.join(output_dir, renamed_model)  # path to the model we just trained
    detectron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = cfg.data.score_th  # set a custom testing threshold

    predictor = DefaultPredictor(detectron2_cfg)
    dataset_dicts = get_test_xray_dict(data_dir)

    # Visualize ------------------------------------------------------
    target_image_ids = ['9a5094b2563a1ef3ff50dc5c7ff71345',
                        '22b8e616a61bbc4caaed0cf23b7159df',
                        '001d127bad87592efe45a5c7678f8b8d',
                        '008b3176a7248a0a189b5731ac8d2e95']
    for th in [0, 0.2, 0.5, 0.7]:
        visualize(target_image_ids, data_dir, output_dir, predictor, score_th=th)

    # Metrics
    if os.path.exists(os.path.join(output_dir, 'metrics.json')):
        metrics_df = pd.read_json(os.path.join(output_dir, 'metrics.json'),
                                  orient="records", lines=True)
        mdf = metrics_df.sort_values("iteration")
        mdf3 = mdf[~mdf["bbox/AP75"].isna()].reset_index(drop=True)
        for i in range(len(mdf3)):
            row = mdf3.iloc[i]
            wandb.log({'AP40': row["bbox/AP75"] / 100.})

        best_score = mdf3["bbox/AP75"].max() / 100.
        wandb.log({'Best-AP40-Score': best_score})

    # Inference ------------------------------------------------------
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    sub = get_submission(dataset_dicts, cfg, predictor, device)

    now = datetime.datetime.now() + datetime.timedelta(hours=9)
    now = now.strftime("%Y%m%d-%H%M%S")
    filename = f'submission_{now}.csv'
    sub.to_csv(os.path.join('./submission', filename), index=False)
    wandb.save(os.path.join('./submission', filename))

    time.sleep(30)
    wandb.finish()
    DatasetCatalog.clear()
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    seed_everything(cfg.train.seed)

    # Comet.ml
    experiment = Experiment(api_key=API_KEY, project_name=PROJECT_NAME)

    # Load Data ################################################################
    # Chris Dataset
    chris_image_size = cfg.data.load_size
    data_dir = f'./input/_Chris_Dataset_{chris_image_size}'
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    img_paths = {
        'train': glob.glob(os.path.join(data_dir, 'train', '*.jpg')),
        'test': glob.glob(os.path.join(data_dir, 'test', '*.jpg'))
    }

    # Cross Validation #########################################################
    # GroupKFold grouped by patient, so a patient never spans train and valid.
    cv = GroupKFold(n_splits=5)
    train['fold'] = -1
    for i, (trn_idx, val_idx) in enumerate(
            cv.split(train, train['target'], groups=train['patient_id'].tolist())):
        train.loc[val_idx, 'fold'] = i

    # Preprocessing ############################################################
    # Drop Image
    drop_image_name = [
        'ISIC_4579531', 'ISIC_7918608', 'ISIC_0948240', 'ISIC_4904364', 'ISIC_8780369', 'ISIC_8770180',
        'ISIC_7148656', 'ISIC_7408392', 'ISIC_9959813', 'ISIC_1894141', 'ISIC_6633174', 'ISIC_3001941',
        'ISIC_4259290', 'ISIC_6833905', 'ISIC_7452152', 'ISIC_2744859', 'ISIC_5464206', 'ISIC_6596403',
        'ISIC_0711790', 'ISIC_5644568', 'ISIC_5843094', 'ISIC_8904326', 'ISIC_4963405', 'ISIC_9839042',
        'ISIC_1355907', 'ISIC_0694037', 'ISIC_9513918', 'ISIC_0787851', 'ISIC_2932886', 'ISIC_2336763',
        'ISIC_4064330', 'ISIC_7358293', 'ISIC_5789052', 'ISIC_7828320', 'ISIC_8277969', 'ISIC_1080647',
        'ISIC_3238159', 'ISIC_8480913', 'ISIC_3790692', 'ISIC_0612624', 'ISIC_1242543', 'ISIC_4036915',
        'ISIC_8174647', 'ISIC_2956783', 'ISIC_3302289', 'ISIC_6761105', 'ISIC_2152755', 'ISIC_9169000',
        'ISIC_6852275', 'ISIC_4432898', 'ISIC_5459207', 'ISIC_7418664', 'ISIC_5136612', 'ISIC_9174738',
        'ISIC_3160301', 'ISIC_7140636', 'ISIC_7718384', 'ISIC_9336675', 'ISIC_4282719', 'ISIC_4330005',
        'ISIC_9828463', 'ISIC_6511141', 'ISIC_5335139', 'ISIC_5104921', 'ISIC_0695575', 'ISIC_0610141',
        'ISIC_5946998', 'ISIC_0464315', 'ISIC_6556513', 'ISIC_3688407', 'ISIC_7730443', 'ISIC_4358550',
        'ISIC_6461484', 'ISIC_9690422', 'ISIC_5374076', 'ISIC_1793200', 'ISIC_1389620', 'ISIC_8098274',
        'ISIC_6425888', 'ISIC_6321076', 'ISIC_4298309', 'ISIC_2981912', 'ISIC_3650938', 'ISIC_4288522',
        'ISIC_9459785', 'ISIC_1938535', 'ISIC_5576241', 'ISIC_6567889', 'ISIC_2768800', 'ISIC_6023795',
        'ISIC_9281339', 'ISIC_6712494', 'ISIC_1811256', 'ISIC_5157055', 'ISIC_3943097', 'ISIC_7194471',
        'ISIC_0361529', 'ISIC_9797578', 'ISIC_3575926', 'ISIC_6166824', 'ISIC_8828670', 'ISIC_6953126',
        'ISIC_4430815', 'ISIC_8146054', 'ISIC_9305209', 'ISIC_4263017', 'ISIC_9314144', 'ISIC_1330763',
        'ISIC_4792936', 'ISIC_1823608', 'ISIC_4910683', 'ISIC_9360142', 'ISIC_2863809', 'ISIC_4748668',
        'ISIC_5681315', 'ISIC_3202829', 'ISIC_3450978', 'ISIC_9704624', 'ISIC_4350914', 'ISIC_3587744',
        'ISIC_8190321', 'ISIC_1766413', 'ISIC_2872769', 'ISIC_3186625', 'ISIC_0170059', 'ISIC_4858099',
        'ISIC_0314462', 'ISIC_2811886', 'ISIC_2140099', 'ISIC_9514450', 'ISIC_1195354', 'ISIC_8325872',
        'ISIC_0227038', 'ISIC_6342641', 'ISIC_4162828', 'ISIC_7597293', 'ISIC_5278307', 'ISIC_3774190',
        'ISIC_2957196', 'ISIC_4443545', 'ISIC_3455136', 'ISIC_0610499', 'ISIC_8483008', 'ISIC_0243683',
        'ISIC_9028131', 'ISIC_8507102', 'ISIC_7128535', 'ISIC_4085552', 'ISIC_2940763', 'ISIC_1219894',
        'ISIC_1043313', 'ISIC_6587979', 'ISIC_7050773', 'ISIC_3230164', 'ISIC_5159557', 'ISIC_7854457',
        'ISIC_2582493', 'ISIC_5161114', 'ISIC_5238910', 'ISIC_6515221', 'ISIC_7771339', 'ISIC_9274260',
        'ISIC_8054626', 'ISIC_1178847', 'ISIC_0236778', 'ISIC_6704518', 'ISIC_4214813', 'ISIC_0322818',
        'ISIC_0230209', 'ISIC_7682938', 'ISIC_1852500', 'ISIC_3699454', 'ISIC_4693693', 'ISIC_9574591',
        'ISIC_3465766', 'ISIC_1826803', 'ISIC_6234881', 'ISIC_2417958', 'ISIC_8142203', 'ISIC_5019268',
        'ISIC_3251719', 'ISIC_4654808', 'ISIC_1027856', 'ISIC_3262153', 'ISIC_4681838', 'ISIC_6594555',
        'ISIC_8623291', 'ISIC_3167092', 'ISIC_8791163', 'ISIC_1538510', 'ISIC_3962218', 'ISIC_2160145',
        'ISIC_7690654', 'ISIC_9464203', 'ISIC_4673844', 'ISIC_9481260', 'ISIC_5407240', 'ISIC_5179742',
        'ISIC_8851901', 'ISIC_7433711', 'ISIC_5777548', 'ISIC_2164933', 'ISIC_7194695', 'ISIC_7115605',
        'ISIC_7560157', 'ISIC_1323909', 'ISIC_0307958', 'ISIC_8015259', 'ISIC_3089729', 'ISIC_3048886',
        'ISIC_0861066', 'ISIC_6110309', 'ISIC_9103289', 'ISIC_2853454', 'ISIC_1436572', 'ISIC_9650546',
        'ISIC_8208962', 'ISIC_5218561', 'ISIC_3285862', 'ISIC_5361506', 'ISIC_8196660', 'ISIC_0356238',
        'ISIC_1156392', 'ISIC_2761440', 'ISIC_0645462', 'ISIC_4908514', 'ISIC_1374795', 'ISIC_3481768',
        'ISIC_2102371', 'ISIC_4548990', 'ISIC_7200676', 'ISIC_8827725', 'ISIC_0667149', 'ISIC_7028320',
        'ISIC_5485142', 'ISIC_9698871', 'ISIC_7764481', 'ISIC_8831706', 'ISIC_4478276', 'ISIC_0401250',
        'ISIC_6987824', 'ISIC_7789537', 'ISIC_1114860', 'ISIC_7586566', 'ISIC_0343061', 'ISIC_1442157',
        'ISIC_9161937', 'ISIC_5904214', 'ISIC_8335489', 'ISIC_9994768', 'ISIC_4384331', 'ISIC_0639415',
        'ISIC_0982984', 'ISIC_2195070', 'ISIC_9022865', 'ISIC_0159060', 'ISIC_4933735', 'ISIC_3571989',
        'ISIC_8593130', 'ISIC_1585919', 'ISIC_3907656', 'ISIC_9728805', 'ISIC_6029052', 'ISIC_3582787',
        'ISIC_2205007', 'ISIC_1447559'
    ]
    train = train[~train['image_name'].isin(drop_image_name)].reset_index(drop=True)

    # Preprocessing metadata
    # OneHotEncoder
    train, test = preprocessing_meta(train, test)
    features_num = len([
        f for f in train.columns
        if f not in ['image_name', 'patient_id', 'target', 'fold']
    ])

    # Model ####################################################################
    net = ENet(model_name=cfg.train.model_name, meta_features_num=features_num)
    transform = ImageTransform(img_size=cfg.data.img_size, input_res=chris_image_size)

    # Lightning Module #########################################################
    model = MelanomaSystem(net, cfg, img_paths, train, test, transform, experiment)

    checkpoint_callback = ModelCheckpoint(filepath='./checkpoint',
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          prefix=cfg.exp.exp_name + '_')

    trainer = Trainer(max_epochs=cfg.train.epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=[0])

    # Train & Test #############################################################
    # Train
    trainer.fit(model)
    experiment.log_metric('best_auc', model.best_auc)
    checkpoint_path = glob.glob(f'./checkpoint/{cfg.exp.exp_name}_*.ckpt')[0]
    experiment.log_asset(file_data=checkpoint_path)

    # Test
    for i in range(test_num):
        trainer.test(model)

    # Submit
    sub_list = glob.glob(f'submission_{cfg.exp.exp_name}*.csv')
    _ = summarize_submit(sub_list, experiment,
                         filename=f'submission_all_{cfg.exp.exp_name}.csv')

    # oof
    valid_dataset = MelanomaDataset(train, img_paths['train'], transform, phase='test')
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=cfg.train.batch_size,
                                  pin_memory=False,
                                  shuffle=False,
                                  drop_last=False)
    for i in range(10):
        trainer.test(model, test_dataloaders=valid_dataloader)

    # Submit
    sub_list = glob.glob('submission*.csv')
    _ = summarize_submit(sub_list, experiment,
                         filename=f'submission_oof_{cfg.exp.exp_name}.csv')

    # Reset
    del model, trainer, net, experiment
def run(config: dict, holdout: bool, debug: bool) -> None:
    log("Run with configuration:")
    log(f"{config}")
    seed_everything(config["seed"])

    with span("Load train and test set:"):
        train_test_set = load_train_test_set(config)
        log(f"{train_test_set.shape}")

    emb_df = pd.read_csv("./data/interim/emb_df.csv")
    n_emb = emb_df.shape[1] - 1
    emb_cols = [str(i) for i in range(n_emb)]
    emb_df.rename(columns={"city_id": "past_city_id"}, inplace=True)

    with span("Preprocessing:"):
        with span("Shift target values for input sequence."):
            unk_city_id = 0
            train_test_set["past_city_id"] = (
                train_test_set.groupby("utrip_id")["city_id"]
                .shift(1).fillna(unk_city_id).astype(int))
            unk_hotel_country = "UNK"
            train_test_set["past_hotel_country"] = (
                train_test_set.groupby("utrip_id")["hotel_country"]
                .shift(1).fillna(unk_hotel_country).astype(str))

            train_test_set = pd.merge(train_test_set, emb_df,
                                      on="past_city_id", how="left")
            train_test_set[emb_cols] = train_test_set[emb_cols].fillna(0)
            train_test_set["city_embedding"] = train_test_set[emb_cols].apply(
                lambda x: list(x), axis=1)

        with span("Encode of target values."):
            target_le = preprocessing.LabelEncoder()
            train_test_set["city_id"] = target_le.fit_transform(
                train_test_set["city_id"])
            train_test_set["past_city_id"] = target_le.transform(
                train_test_set["past_city_id"])

        with span("Add features."):
            log("Convert data type of checkin and checkout.")
            train_test_set["checkin"] = pd.to_datetime(train_test_set["checkin"])
            train_test_set["checkout"] = pd.to_datetime(train_test_set["checkout"])

            log("Create month_checkin feature.")
            train_test_set["month_checkin"] = train_test_set["checkin"].dt.month
            train_test_set["year_checkin"] = train_test_set["checkin"].dt.year

            log("Create days_stay feature.")
            train_test_set["days_stay"] = (
                train_test_set["checkout"] -
                train_test_set["checkin"]).dt.days.apply(lambda x: np.log10(x))

            log("Create num_checkin feature.")
            train_test_set["num_checkin"] = (
                train_test_set.groupby("utrip_id")["checkin"]
                .rank().apply(lambda x: np.log10(x)))

            log("Create days_move feature.")
            train_test_set["past_checkout"] = train_test_set.groupby(
                "utrip_id")["checkout"].shift(1)
            train_test_set["days_move"] = (
                (train_test_set["checkin"] - train_test_set["past_checkout"])
                .dt.days.fillna(0).apply(lambda x: np.log1p(x)))

            log("Create aggregation features.")
            num_visit_drop_duplicates = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].drop_duplicates().groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit_drop_duplicates.columns = [
                "past_city_id", "num_visit_drop_duplicates"
            ]
            num_visit = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit.columns = ["past_city_id", "num_visit"]
            num_visit_same_city = train_test_set[
                train_test_set['city_id'] == train_test_set['city_id'].shift(1)
            ].groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit_same_city.columns = ["past_city_id", "num_visit_same_city"]

            train_test_set = pd.merge(train_test_set, num_visit_drop_duplicates,
                                      on="past_city_id", how="left")
            train_test_set = pd.merge(train_test_set, num_visit,
                                      on="past_city_id", how="left")
            train_test_set = pd.merge(train_test_set, num_visit_same_city,
                                      on="past_city_id", how="left")
            train_test_set["num_visit_drop_duplicates"].fillna(0, inplace=True)
            train_test_set["num_visit"].fillna(0, inplace=True)
            train_test_set["num_visit_same_city"].fillna(0, inplace=True)

            train_test_set["num_stay_consecutively"] = train_test_set.groupby(
                ["utrip_id", "past_city_id"])["past_city_id"].rank(
                    method="first").fillna(1).apply(lambda x: np.log1p(x))

        with span("Encode of categorical values."):
            cat_le = {}
            for c in CATEGORICAL_COLS:
                le = preprocessing.LabelEncoder()
                train_test_set[c] = le.fit_transform(
                    train_test_set[c].fillna("UNK").astype(str).values)
                cat_le[c] = le

    train = train_test_set[train_test_set["row_num"].isnull()]
    test = train_test_set[~train_test_set["row_num"].isnull()]

    with span("aggregate features by utrip_id"):
        x_train, x_test_using_train, x_test = [], [], []
        for c in ["city_id", "past_city_id"] + CATEGORICAL_COLS + NUMERICAL_COLS:
            x_train.append(train.groupby("utrip_id")[c].apply(list))
            x_test.append(test.groupby("utrip_id")[c].apply(list))
            x_test_using_train.append(
                test.groupby("utrip_id")[c].apply(lambda x: list(x)[:-1]))
        x_train = pd.concat(x_train, axis=1)
        x_test = pd.concat(x_test, axis=1)
        x_test_using_train = pd.concat(x_test_using_train, axis=1)

    with span("sampling training data"):
        x_train["n_trips"] = x_train["city_id"].map(lambda x: len(x))
        x_test_using_train["n_trips"] = x_test_using_train["city_id"].map(
            lambda x: len(x))
        x_train = (x_train.query("n_trips > 2")
                   .sort_values("n_trips").reset_index(drop=True))
        x_test_using_train = (
            x_test_using_train.sort_values("n_trips").reset_index(drop=True))
        x_test = x_test.reset_index(drop=True)
        log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

    if debug:
        log("'--debug' specified. Shrink data size into 1000.")
        x_train = x_train.iloc[:1000]
        x_test = x_test.iloc[:1000]
        config["params"]["num_epochs"] = 2
        log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

    with span("Prepare data loader for test:"):
        test_dataset = Dataset(x_test, is_train=False)
        test_dataloader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=1,
            num_workers=os.cpu_count(),
            pin_memory=True,
            collate_fn=Collator(is_train=False),
            shuffle=False,
        )

    with span("Get folds:"):
        # Stratify by binned trip length so folds see similar sequence lengths.
        cv = StratifiedKFold(
            n_splits=config["fold"]["n_splits"],
            shuffle=config["fold"]["shuffle"],
        )
        folds = cv.split(x_train, pd.cut(x_train["n_trips"], 5, labels=False))

    log("Training:")
    oof_preds = np.zeros((len(x_train), len(target_le.classes_)), dtype=np.float32)
    test_preds = np.zeros((len(x_test), len(target_le.classes_)), dtype=np.float32)
    for i_fold, (trn_idx, val_idx) in enumerate(folds):
        if holdout and i_fold > 0:
            break
        with span(f"Fold = {i_fold}"):
            x_trn = x_train.loc[trn_idx, :]
            x_val = x_train.loc[val_idx, :]
            x_trn = pd.concat([x_trn, x_test_using_train],
                              axis=0, ignore_index=True)

            train_dataset = Dataset(x_trn, is_train=True)
            valid_dataset = Dataset(x_val, is_train=True)
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=config["params"]["batch_size"],
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=True,
            )
            valid_dataloader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=1,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=False,
            )

            model_cls = MODELS[config["model_name"]]
            model = model_cls(
                n_city_id=len(target_le.classes_),
                n_booker_country=len(cat_le["booker_country"].classes_),
                n_device_class=len(cat_le["device_class"].classes_),
                n_affiliate_id=len(cat_le["affiliate_id"].classes_),
                n_month_checkin=len(cat_le["month_checkin"].classes_),
                n_hotel_country=len(cat_le["past_hotel_country"].classes_),
                emb_dim=config["params"]["emb_dim"],
                rnn_dim=config["params"]["rnn_dim"],
                dropout=config["params"]["dropout"],
                rnn_dropout=config["params"]["rnn_dropout"],
            )
            if i_fold == 0:
                log(f"{summary(model)}")
            criterion = FocalLossWithOutOneHot(gamma=0.5)

            # Prepare optimizer: no weight decay on biases and LayerNorm params.
            param_optimizer = list(model.named_parameters())
            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.01,
                },
                {
                    "params": [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.0,
                },
            ]
            optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=1e-4,
                weight_decay=0.01,
            )
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=30, eta_min=1e-6)
            logdir = (Path(config["output_dir_path"]) / config["exp_name"] /
                      f"fold{i_fold}")
            loaders = {"train": train_dataloader, "valid": valid_dataloader}

            runner = CustomRunner(device=DEVICE)
            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                loaders=loaders,
                main_metric="accuracy04",
                minimize_metric=False,
                logdir=logdir,
                num_epochs=config["params"]["num_epochs"],
                verbose=True,
            )

            log("Predictions using validation data")
            oof_preds[val_idx, :] = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=valid_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
            y_val = x_val["city_id"].map(lambda x: x[-1]).values
            score = top_k_accuracy_score(
                y_val, oof_preds[val_idx, :], k=4,
                labels=np.arange(len(target_le.classes_)))
            log(f"val acc@4: {score}")
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_val_pred_fold{i_fold}",
                oof_preds[val_idx, :],
            )

            test_preds_ = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=test_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
            test_preds += test_preds_ / cv.n_splits
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_test_pred_fold{i_fold}",
                test_preds_,
            )

    log("Evaluation OOF values:")
    y_train = x_train["city_id"].map(lambda x: x[-1])
    score = top_k_accuracy_score(y_train, oof_preds, k=4,
                                 labels=np.arange(len(target_le.classes_)))
    log(f"oof acc@4: {score}")

    log("Save files:")
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / "y_oof_pred",
        oof_preds,
    )
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / "y_test_pred",
        test_preds,
    )
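# `FocalLossWithOutOneHot` is referenced above but not shown in this section.
# Below is a minimal sketch of what the name suggests — a focal loss that takes
# integer class indices directly rather than one-hot targets. Treat it as an
# assumption, not the repo's exact implementation.
import torch
import torch.nn.functional as F


class FocalLossWithOutOneHot(torch.nn.Module):
    """Focal loss over class-index targets (sketch, not the repo's code)."""

    def __init__(self, gamma: float = 0.5):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        log_prob = F.log_softmax(logits, dim=-1)
        # Per-sample negative log-likelihood of the true class.
        nll = F.nll_loss(log_prob, target, reduction="none")
        # Probability of the true class, used as the focusing term.
        pt = log_prob.exp().gather(1, target.unsqueeze(1)).squeeze(1)
        return ((1.0 - pt) ** self.gamma * nll).mean()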
def run(X_seq_train, X_cont_train, y_train, X_seq_test, X_cont_test,
        timestamp, random_state):
    seed_everything(random_state)
    oof_preds = np.zeros(len(X_seq_train))
    test_preds = np.zeros(len(X_seq_test))
    cv_scores = []

    for i, (trn_idx, val_idx) in enumerate(
            get_folds(5, "stratified", random_state).split(X_cont_train, y_train)):
        print(f"fold {i + 1}")
        train_dataset = TensorDataset(
            torch.from_numpy(X_seq_train[trn_idx]).float(),
            torch.from_numpy(X_cont_train[trn_idx]).float(),
            torch.from_numpy(y_train[trn_idx]).float(),
        )
        valid_dataset = TensorDataset(
            torch.from_numpy(X_seq_train[val_idx]).float(),
            torch.from_numpy(X_cont_train[val_idx]).float(),
            torch.from_numpy(y_train[val_idx]).float(),
        )
        test_dataset = TensorDataset(
            torch.from_numpy(X_seq_test).float(),
            torch.from_numpy(X_cont_test).float())

        train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
        valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=128)
        test_loader = DataLoader(test_dataset, shuffle=False, batch_size=128)
        loaders = {"train": train_loader, "valid": valid_loader}

        runner = CustomRunner(device="cuda")
        model = Model(
            in_channels=X_seq_train.shape[1],
            n_cont_features=X_cont_train.shape[1],
            hidden_channels=64,
            kernel_sizes=[3, 5, 7, 15, 21, 51, 101],
            out_dim=1,
        )
        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=30, eta_min=1e-6)
        logdir = f"./logdir/{timestamp}_fold{i}"
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            logdir=logdir,
            num_epochs=30,
            verbose=True,
        )

        # Out-of-fold predictions from the best checkpoint.
        pred = np.concatenate(
            list(
                map(
                    lambda x: x.cpu().numpy(),
                    runner.predict_loader(
                        loader=valid_loader,
                        resume=f"{logdir}/checkpoints/best.pth",
                        model=model,
                    ),
                )))
        oof_preds[val_idx] = pred
        score = average_precision_score(y_train[val_idx], pred)
        cv_scores.append(score)
        print("score", score)

        # Test predictions, averaged over the 5 folds.
        pred = np.concatenate(
            list(
                map(
                    lambda x: x.cpu().numpy(),
                    runner.predict_loader(
                        loader=test_loader,
                        resume=f"{logdir}/checkpoints/best.pth",
                        model=model,
                    ),
                )))
        test_preds += pred / 5

    return oof_preds, test_preds, cv_scores
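# A hypothetical smoke test for run() with synthetic arrays; shapes follow the
# signature above (channels on axis 1 of X_seq_*). It still needs the project's
# Model, CustomRunner, and get_folds, plus a CUDA device — the sizes and the
# timestamp string here are made up.
import numpy as np

X_seq_train = np.random.randn(256, 4, 128)   # (samples, channels, sequence length)
X_cont_train = np.random.randn(256, 10)
y_train = np.random.randint(0, 2, size=256).astype(np.float32)
X_seq_test = np.random.randn(64, 4, 128)
X_cont_test = np.random.randn(64, 10)

oof_preds, test_preds, cv_scores = run(
    X_seq_train, X_cont_train, y_train,
    X_seq_test, X_cont_test,
    timestamp="20240101-000000", random_state=42,
)
print(np.mean(cv_scores))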
import argparse
from multiprocessing import cpu_count

import numpy as np
from tensorflow import keras

from src.get_folds import Fold
from src.runner import Runner
from src.utils import get_logger, json_dump, seed_everything
from src.submission import create_submission
from features.base import load_features
from models.model_1d_cnn import Model_1DCNN

import matplotlib
matplotlib.use('Agg')  # headless backend: render plots without a display
import matplotlib.pyplot as plt
import seaborn as sns

seed_everything(71, gpu_mode=True)

model_map = {'1dcnn': Model_1DCNN}


def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_1dcnn_0.json')
def main(cfg: DictConfig):
    print('Cassava Leaf Disease Classification')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    # Config -------------------------------------------------------------------
    data_dir = './input'
    seed_everything(cfg.data.seed)

    # Comet_ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False)

    # Log Parameters
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))

    # Data Module ---------------------------------------------------------------
    transform = get_transforms(transform_name=cfg.data.transform, img_size=cfg.data.img_size)
    cv = StratifiedKFold(n_splits=cfg.data.n_splits, shuffle=True, random_state=cfg.data.seed)
    dm = CassavaDataModule(data_dir, cfg, transform, cv, use_merge=True, sample=DEBUG)

    # Model ----------------------------------------------------------------------
    net = Timm_model(cfg.train.model_type, pretrained=True)

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Loss fn ---------------------------------------------------------------------
    # Class-frequency weights to counter label imbalance.
    df = pd.read_csv('./input/merged.csv')
    weight = df['label'].value_counts().sort_index().tolist()
    weight = [w / len(df) for w in weight]
    weight = torch.tensor(weight).cuda()
    del df
    criterion = get_loss_fn(cfg.train.loss_fn, weight=weight, smoothing=0.05)

    # Optimizer, Scheduler --------------------------------------------------------
    if cfg.train.use_sam:
        base_optimizer = RAdam
        optimizer = SAM(net.parameters(), base_optimizer,
                        lr=cfg.train.lr, weight_decay=cfg.train.weight_decay)
    else:
        optimizer = RAdam(net.parameters(), lr=cfg.train.lr,
                          weight_decay=cfg.train.weight_decay)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.train.epoch, eta_min=0)

    # Lightning Module -------------------------------------------------------------
    model = CassavaLightningSystem(net, cfg,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   experiment=experiment)

    # Trainer -------------------------------------------------------------------------
    trainer = Trainer(
        logger=False,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        amp_backend='apex',
        amp_level='O2',
        num_sanity_val_steps=0,  # Skip Sanity Check
        automatic_optimization=False if cfg.train.use_sam else True,  # SAM needs manual optimization
        # resume_from_checkpoint='./checkpoints/epoch=3-step=14047.ckpt'
    )

    # Train
    trainer.fit(model, datamodule=dm)
# data directory
TRAIN_DATA_A_DIR = config.domain_a_dir
TRAIN_DATA_B_DIR = config.domain_b_dir

# save directory
MODEL_G_DIR = f'{ROOT_DIR}/checkpoint/G/{str(No).zfill(3)}'
MODEL_F_DIR = f'{ROOT_DIR}/checkpoint/F/{str(No).zfill(3)}'
MODEL_DA_DIR = f'{ROOT_DIR}/checkpoint/D1/{str(No).zfill(3)}'
MODEL_DB_DIR = f'{ROOT_DIR}/checkpoint/D2/{str(No).zfill(3)}'
LOG_DIR = f'{ROOT_DIR}/output/log/{str(No).zfill(3)}'
FIGURE_DIR = f'{ROOT_DIR}/output/figure/{str(No).zfill(3)}'
PRED_VAL_A_TO_B_DIR = f'{ROOT_DIR}/output/pred_val_a_to_b/{str(No).zfill(3)}'
PRED_VAL_B_TO_A_DIR = f'{ROOT_DIR}/output/pred_val_b_to_a/{str(No).zfill(3)}'

seed_everything(SEED)

# Split domain A into train/valid folds; domain B is used in full for both.
kf = KFold(n_splits=N_SPLIT, shuffle=True, random_state=SEED)
img_ids_A = np.array(sorted(os.listdir(TRAIN_DATA_A_DIR)))
tr_ix, va_ix = list(kf.split(img_ids_A, img_ids_A))[FOLD]
train_A, valid_A = img_ids_A[tr_ix], img_ids_A[va_ix]

img_ids_B = np.array(sorted(os.listdir(TRAIN_DATA_B_DIR)))
train_B, valid_B = img_ids_B, img_ids_B

seed_everything(DATA_LOADER_SEED)

if DEBUG:
    train_A = train_A[:2]
    valid_A = valid_A[:2]

train_dataset = PokemonTrainDataset(train_A,
def run_experiment(version, key):
    time_experiment = datetime.now().strftime("%m%d%Y_%H%M")
    seed_everything(0)

    # Predefined model builders
    modellers = {
        "logistic": clf_logistic,
        "lgb": clf_lgb,
        "xgb": clf_xgb,
        "catboost": clf_catboost,
    }

    ########################### READ PARAMETERS
    conf = read_configuration(key)
    classifier = conf["classifier"]
    params = conf["params"]
    split = conf["splits"]
    logger.info("Begin run experiment")

    ########################### LOADING DATASET
    logger.info("Loading dataset")
    ds = Dataset()
    ds.load_dataset(version)

    ########################### BUILD CROSS VALIDATION STRATEGY
    logger.info("Build folds")
    date_ranges = [
        # [["2018-01-01", "2018-05-31"], ["2017-12-01", "2017-12-31"]],
        [["2017-12-01", "2018-04-15"], ["2018-05-01", "2018-05-31"]]
    ]
    splits = {
        "holdout": CustomDateSplitter(ds.X_train["TransactionDT"], date_ranges),
        # Time-ordered folds; random_state has no effect when shuffle=False.
        "kfold": KFold(n_splits=6, shuffle=False),
    }
    folds = splits[split]

    ########################### PREPROCESSING DATA
    logger.info("Preprocessing data")
    build_processed_dataset(ds)
    gc.collect()

    ########################### TRAIN MODEL
    logger.info(f"Building {classifier} model")
    result = run_train_predict(ds, modellers[classifier], params, folds)

    ########################### SAVING
    if conf["save_predictions"]:
        path_to_preds = f"{key}_{time_experiment}"
        logger.info(f"Saving {key} predictions to {path_to_preds}")
        ds.submission["isFraud"] = result["prediction"]
        ds.write_submission(path_to_preds)

    if conf["save_models"]:
        # ds.save_dataset(f"{key}_processed")
        path_to_models = get_root_dir() / f"models/{key}_{time_experiment}"
        logger.info(f"Saving raw models to {path_to_models}")
        os.mkdir(path_to_models)
        write_params(params, path_to_models / "params.json")
        for i, model in enumerate(result["models"]):
            save_model(model, path_to_models / f"fold_{i}")
        open(path_to_models / "_SUCCESS", "a").close()

    logger.info("End run experiment")
def main(train_path, test_path, max_features, max_len, glove_path, para_path,
         model_save_path, epochs=4, batch_size=512, seed=1029):
    logger_path = os.path.join(model_save_path, "log.txt")
    setup_logger(out_file=logger_path)

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train = df_parallelize_run(train, text_clean_wrapper)
    test = df_parallelize_run(test, text_clean_wrapper)

    # Tokenize on the union of train and test text.
    tk = Tokenizer(lower=True, filters='', num_words=max_features)
    full_text = list(train['question_text'].values) + list(test['question_text'].values)
    tk.fit_on_texts(full_text)

    train_tokenized = tk.texts_to_sequences(train['question_text'].fillna('missing'))
    test_tokenized = tk.texts_to_sequences(test['question_text'].fillna('missing'))
    word_index = tk.word_index

    X_train = pad_sequences(train_tokenized, maxlen=max_len)
    X_test = pad_sequences(test_tokenized, maxlen=max_len)
    y_train = train['target'].values

    # Blend GloVe and Paragram embeddings 0.8 / 0.2.
    get_embedding = GetEmbedding(max_features, word_index)
    glove = get_embedding.load(glove_path, emb_mean=-0.005838499, emb_std=0.48782197)
    para = get_embedding.load(para_path, emb_mean=-0.0053247833, emb_std=0.49346462)
    embedding_matrix = glove * 0.8 + para * 0.2
    del glove, para
    gc.collect()

    x_test_cuda = torch.tensor(X_test, dtype=torch.long).cuda()
    test_dataset = torch.utils.data.TensorDataset(x_test_cuda)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=10).split(X_train, y_train))

    seed_everything()
    train_preds = np.zeros(len(train))
    test_preds = np.zeros((len(test), len(splits)))

    for i, (train_idx, valid_idx) in enumerate(splits):
        model_path = os.path.join(model_save_path, "model_fold{}".format(i + 1))
        x_train_fold = torch.tensor(X_train[train_idx], dtype=torch.long).cuda()
        y_train_fold = torch.tensor(y_train[train_idx, np.newaxis], dtype=torch.float32).cuda()
        x_val_fold = torch.tensor(X_train[valid_idx], dtype=torch.long).cuda()
        y_val_fold = torch.tensor(y_train[valid_idx, np.newaxis], dtype=torch.float32).cuda()

        train_dataset = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
        valid_dataset = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

        print(f'Fold {i + 1}')
        seed_everything(seed + i)
        model = NeuralNet(max_features, max_len, embedding_matrix)
        model.cuda()

        optimizer = torch.optim.Adam(model.parameters())
        scheduler = CosineAnnealingLR(optimizer, T_max=3)
        loss = torch.nn.BCEWithLogitsLoss(reduction='mean').cuda()

        trainer = Trainer(model, train_loader, valid_loader, y_val_fold, test_loader,
                          loss, optimizer, scheduler, model_path, epochs, batch_size)
        valid_preds_fold, test_preds_fold = trainer.run(validate=True)

        train_preds[valid_idx] = valid_preds_fold
        test_preds[:, i] = test_preds_fold

    # Pick the decision threshold on OOF predictions, then binarize test preds.
    search_result = threshold_search(y_train, train_preds)
    print(search_result)
    test_preds = test_preds.mean(1) > search_result['threshold']
    return test_preds
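# `threshold_search` is used above but not defined in this section. A minimal
# sketch of the usual approach — grid-search the binarization threshold that
# maximizes F1 on the out-of-fold predictions; the returned keys match the
# 'threshold' access above, but the exact grid and metric are assumptions.
import numpy as np
from sklearn.metrics import f1_score


def threshold_search(y_true, y_proba):
    """Return the threshold maximizing F1 over a coarse grid (sketch)."""
    best = {'threshold': 0.5, 'f1': 0.0}
    for threshold in np.arange(0.1, 0.51, 0.01):
        score = f1_score(y_true, y_proba > threshold)
        if score > best['f1']:
            best = {'threshold': float(threshold), 'f1': float(score)}
    return best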
import argparse
from multiprocessing import cpu_count

import lightgbm as lgb
from sklearn.model_selection import train_test_split

from src.runner import Runner
from src.utils import get_logger, json_dump, seed_everything
from src.submission import create_submission
from features.base import load_features
from models.model_lightgbm import Model_LightGBM

import matplotlib
matplotlib.use('Agg')  # headless backend: render plots without a display
import matplotlib.pyplot as plt
import seaborn as sns

seed_everything(71)

model_map = {'lightgbm': Model_LightGBM}


def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_0.json')
parser.add_argument('--output_channel', type=int, default=256)
parser.add_argument('--hidden_size', type=int, default=256)
parser.add_argument('--num_fiducial', type=int, default=20,
                    help='number of fiducial points of TPS-STN')
args = parser.parse_args()

assert args.dataset_name in CONFIGS

if args.checkpoint_path:
    # warning: when resuming, the seed must be changed by hand
    seed = round(datetime.utcnow().timestamp()) % 10000
else:
    seed = args.seed
utils.seed_everything(seed)

config = CONFIGS[args.dataset_name](
    data_dir=args.data_dir,
    experiment_name=args.experiment_name,
    experiment_description=args.experiment_description,
    image_w=args.image_w,
    image_h=args.image_h,
    num_epochs=args.num_epochs,
    bs=args.bs,
    num_workers=args.num_workers,
    seed=seed,
    batch_max_length=args.batch_max_length,
    FeatureExtraction=args.FeatureExtraction,
    SequenceModeling=args.SequenceModeling,
    Prediction=args.Prediction,
def main(cfg):
    SEED = cfg.values.seed
    BATCH_SIZE = cfg.values.train_args.batch_size
    IMAGE_SIZE = cfg.values.image_size
    USE_KFOLD = cfg.values.use_kfold
    NUM_FOLD = cfg.values.train_args.num_fold if USE_KFOLD else 0

    seed_everything(SEED)
    print(f'Cuda is Available ? : {torch.cuda.is_available()}\n')

    data_df = pd.read_csv('E:/seti-breakthrough-listen/train_labels.csv')
    data_df['file_path'] = data_df['id'].apply(get_train_file_path)

    train_transform = albumentations.Compose([
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE),
        albumentations.HorizontalFlip(),
        albumentations.VerticalFlip(),
        # albumentations.Normalize(mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)),
        albumentations.pytorch.transforms.ToTensorV2()
    ])
    val_transform = albumentations.Compose([
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE),
        # albumentations.Normalize(mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)),
        albumentations.pytorch.transforms.ToTensorV2()
    ])

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=SEED)
        for k, (train_index, val_index) in enumerate(kfold.split(data_df, data_df['target'])):
            print('\n')
            cpprint('=' * 15 + f'{k + 1}-Fold Cross Validation' + '=' * 15)
            train_df = data_df.iloc[train_index].reset_index(drop=True)
            val_df = data_df.iloc[val_index].reset_index(drop=True)

            train_loader = get_dataloader(df=train_df, transform=train_transform,
                                          batch_size=BATCH_SIZE, shuffle=True)
            val_loader = get_dataloader(df=val_df, transform=val_transform,
                                        batch_size=BATCH_SIZE, shuffle=False)
            val_labels = val_df['target'].values.tolist()

            train(cfg, train_loader, val_loader, val_labels, k + 1)
    else:
        print('\n')
        cpprint('=' * 15 + 'Start Training' + '=' * 15)
        train_df, val_df = train_test_split(data_df, test_size=0.2, shuffle=True,
                                            stratify=data_df['target'], random_state=SEED)

        train_loader = get_dataloader(df=train_df, transform=train_transform,
                                      batch_size=BATCH_SIZE, shuffle=True)
        val_loader = get_dataloader(df=val_df, transform=val_transform,
                                    batch_size=BATCH_SIZE, shuffle=False)
        val_labels = val_df['target'].values.tolist()

        train(cfg, train_loader, val_loader, val_labels, 0)
import datetime

import cv2
import matplotlib.pyplot as plt
import mlflow
import torch
import torch.utils
from tqdm.autonotebook import tqdm

from src.types import Boxes
from typing import List, Dict, Any

config = Config(".")
config.n_folds = 0
seed_everything(config.seed)
transforms: Transforms = get_transforms()
start_time = datetime.datetime.now().isoformat()

with timer("load raw data"):
    data: WheatData = get_data(config)

cv_num = 0
with timer("prepare dataloader and fitter"):
    train_image_ids, train_df, val_image_ids, val_df = data.get_fold(cv_num)
    train_dataset: WheatDataset = get_wheat_dataset(
        config.INPUT_DIR,
        train_image_ids,
        train_df,
        "train",
import mlflow
from torch_optimizer import RAdam
from warmup_scheduler import GradualWarmupScheduler

from src.utils import seed_everything, ImageTransform
from src.utils import Trainer, QWKLoss, Trainer_multifold, get_dataloaders, Santa
from src.model import ModelEFN, ModelEFN_2

# Path separator per OS ('nt' is Windows).
if os.name == 'nt':
    sep = '\\'
else:
    sep = '/'

seed = 42
seed_everything(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


@hydra.main('config.yml')
def main(cfg: DictConfig):
    # Config ################################################################
    IMAGE_NUM = cfg.data.image_num
    IMAGE_SIZE = cfg.data.image_size
    exp_name = cfg.data.exp
    model_name = f'efficientnet-{cfg.data.model_name}'
    BATCH_SIZE = cfg.training.batch_size
    lr = cfg.training.lr
    NUM_EPOCHS = cfg.training.num_epoch
    FOLD = cfg.training.fold
    OPTIMIZER = cfg.training.optimizer
from torchvision import transforms
from tqdm import tqdm

from src.data_loader import get_data_loader
from src.evaluation import performance_plot
from src.loss import get_loss_function
from src.model import DecoderRNN, DecoderRNNUpdated, EncoderCNN
from src.optimizer import get_optimizer
from src.utils import Config, seed_everything

COMMET_ML_API_KEY = os.environ.get("COMMET_ML_API_KEY")
experiment = Experiment(api_key=COMMET_ML_API_KEY,
                        project_name="image_caption_generation")

print("Seed everything. Ensure reproducibility...")
seed_everything(seed=42)

if __name__ == "__main__":
    config = Config("config.yaml")
    if config.DEV_MODE:
        warnings.warn(f"Running in dev_mode: {config.DEV_MODE}")

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # (Optional) TODO #2: Amend the image transform below.
    transform_train = transforms.Compose([
        transforms.Resize(256),      # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(
from datetime import datetime
import os

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

from src.train import train
from src import config
from src.utils import seed_everything

if __name__ == '__main__':
    print("SEED number : %d" % config.SEED)
    seed_everything(config.SEED)

    now = datetime.now()
    now = f'{now.year}{now.month}{now.day}{now.hour}{now.minute}'

    # Define params
    nfold = 5
    model_name = 'b4'
    n_epochs = 25
    pretraining = True
    # weight_path = "input/efn_b4_nfNone_ep13_vl0.3339_vk0.7768_acc0.7882.pt"
    weight_path = None

    print("N Fold : {}, Model : EFN_{}, N_epochs : {}".format(nfold, model_name, n_epochs))

    # load Dataset
    # train_csv = pd.read_csv(os.path.join(config.DATA_PATH, 'prev_curr_train_v2.csv'))
    train_csv = pd.read_csv(os.path.join(config.DATA_PATH, 'prev_curr_train.csv'))
    # train_csv = pd.read_csv(os.path.join(config.DATA_PATH, 'prev_curr_train_v1_1.csv'))
import numpy as np
import pandas as pd
from sklearn import preprocessing
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.datasets import load_train_test, BookingDataset
from src.models import BookingNN
from src.utils import seed_everything
from src.runner import CustomRunner

if __name__ == '__main__':
    seed_everything(0)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)

    categorical_cols = [
        'user_id',
        # 'device_class',
        # 'affiliate_id',
        'booker_country',
        # 'hotel_country'
    ]

    train_test = load_train_test()
    # Embedding size rule of thumb: min(50, (cardinality + 1) // 2).
    cat_dims = [int(train_test[col].nunique()) for col in categorical_cols]
    emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
    target_le = preprocessing.LabelEncoder()
def __init__(self, settings, config):
    self.settings = settings
    self.config = config
    self.logger = get_logger()
    seed_everything(seed=settings.SEED)
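# `seed_everything` is the common thread across all of these entry points but is
# never shown in this section. Below is a minimal sketch of a typical
# implementation; the `gpu_mode` keyword seen in one call above suggests some
# variants also seed CUDA. Signatures in this repo may differ.
import os
import random

import numpy as np
import torch


def seed_everything(seed: int = 42, gpu_mode: bool = True) -> None:
    """Seed every RNG in sight for reproducible runs (a sketch)."""
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if gpu_mode and torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Trade a little speed for deterministic cuDNN kernels.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False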