def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    result = {
        "line_id": [],
        "prediction": [],
    }

    for X in pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"]):
        result["line_id"] += list(X["line_id"])
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)
    result.to_csv(prediction_csv, index=False)

    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"])
    else:
        score = None

    return result, score
def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    df = read_df(test_csv, self.config)

    result = {
        "line_id": list(df["line_id"]),
        "prediction": [],
    }

    def chunker(seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    for chunk in chunker(df, 100000):
        X = chunk.copy()
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)
    result.sort_values("line_id", inplace=True)
    result.to_csv(prediction_csv, index=False)

    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"])
    else:
        score = None

    return result, score
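# A minimal standalone sketch (not part of the original snippet) illustrating the
# chunker helper above: pandas positional slicing df[pos:pos + size] yields row
# chunks, which is what lets predictions be generated piecewise. The toy frame
# below is an assumption purely for demonstration.
import pandas as pd

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

if __name__ == "__main__":
    toy = pd.DataFrame({"line_id": range(10), "value": range(10)})
    for chunk in chunker(toy, 4):
        print(chunk.shape)  # (4, 2), (4, 2), (2, 2)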
def predict(self, test_csv: str, prediction_csv: str) -> pd.DataFrame:
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    result = {
        "id": [],
        "prediction": [],
    }

    for X in pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"]):
        result["id"] += list(X["id"])
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)
    result.to_csv(prediction_csv, index=False)

    return result
def train(self, train_csv: str, mode: str):
    self.config["task"] = "train"
    self.config["mode"] = mode
    self.config["model"] = {}
    self.config["ensemble"] = {"lgb": 1}
    self.config.tmp_dir = self.config.model_dir + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    # load holiday calendar
    path_holiday = './holiday.csv'
    holiday = pd.read_csv(path_holiday, encoding='utf-8', low_memory=False,
                          dtype={'holiday': str})['holiday'].values
    self.config['holiday'] = set(holiday)

    df = read_df(train_csv, self.config)
    print(df.shape)

    holiday_detect(df, self.config)
    preprocess(df, self.config)

    y = df["target"]
    X = df.drop("target", axis=1)

    train(X, y, self.config)
def train(self, train_csv: str, mode: str):
    self.config["task"] = "train"
    self.config["mode"] = mode
    self.config.tmp_dir = self.config.model_dir + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    df = read_df(train_csv, self.config)
    preprocess(df, self.config)

    y = df["target"]
    X = df.drop("target", axis=1)

    train(X, y, self.config)
def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    result = {"line_id": [], "prediction": []}
    if 'holiday_detect' in self.config:
        result["datetime"] = []

    for X in pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"]):
        result["line_id"] += list(X["line_id"])
        if 'holiday_detect' in self.config:
            dt_fea = self.config['holiday_detect']
            result["datetime"] += list(X[dt_fea])
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)

    # post process for holiday
    if 'holiday_detect' in self.config:
        holiday = self.config['holiday']
        for idx, row in result.iterrows():
            dt = row['datetime']
            dt_str = str(dt).split(' ')[0].strip()
            if dt_str in holiday or dt.weekday() == 5 or dt.weekday() == 6:
                result.loc[idx, 'prediction'] = 0
        result.drop(["datetime"], axis=1, inplace=True)

    result.to_csv(prediction_csv, index=False)

    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"])
    else:
        score = None

    return result, score
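# Hedged sketch of a vectorized alternative to the iterrows() holiday post-processing
# above: the same zeroing of holiday/weekend predictions via a boolean mask. The toy
# `holiday` calendar and the example frame are assumptions for illustration only.
import pandas as pd

holiday = {"2018-01-01", "2018-05-01"}  # assumed example calendar of "YYYY-MM-DD" strings
result = pd.DataFrame({
    "datetime": pd.to_datetime(["2018-01-01", "2018-01-03", "2018-01-06"]),
    "prediction": [3.2, 4.1, 2.7],
})

is_holiday = result["datetime"].dt.strftime("%Y-%m-%d").isin(holiday)
is_weekend = result["datetime"].dt.weekday >= 5  # 5 = Saturday, 6 = Sunday
result.loc[is_holiday | is_weekend, "prediction"] = 0
print(result)  # rows for 2018-01-01 (holiday) and 2018-01-06 (Saturday) become 0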
def train(self, train_csv: str, mode: str):
    self.config["task"] = "train"
    self.config["mode"] = mode
    self.config.tmp_dir = self.config.model_dir + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    # prepare data
    df = read_df(train_csv, self.config)

    # preprocessing
    preprocess(df, self.config)

    y = df["target"]
    X = df.drop("target", axis=1)
    log('drop target')
    log('####### cur time = ' + str(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
    log('################## after FE #########################')
    log(X.shape)
    log('#####################################################')

    train(X, y, self.config)
def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)
    self.config["prediction_csv"] = prediction_csv
    self.config["line_id"] = []
    self.config["start_time"] = time.time()

    result = {
        "line_id": [],
        "prediction": [],
    }

    X = pd.read_csv(
        test_csv,
        encoding="utf-8",
        low_memory=False,
        dtype=self.config["dtype"],
        parse_dates=self.config["parse_dates"],
    )
    self.config["line_id"] = X["line_id"].values
    result["line_id"] = X["line_id"].values

    X = preprocess(X, self.config)
    X = X[self.config["columns"]]  # for right columns order
    result["prediction"] = predict(X, self.config)

    result = pd.DataFrame(result)
    result.to_csv(prediction_csv, index=False)

    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"], self.config)
    else:
        score = None

    return result, score
def train(self, train_csv: str, mode: str):
    self.config["task"] = "train"
    self.config["mode"] = mode
    self.config["objective"] = "regression" if mode == "regression" else "binary"
    self.config["metric"] = "rmse" if mode == "regression" else "auc"
    self.config.tmp_dir = self.config.model_dir + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    df = read_df(train_csv, self.config)
    df = preprocess(df, self.config)

    y = df["target"].copy()
    X = df.drop("target", axis=1).copy()
    del df
    gc.collect()

    self.config["columns"] = list(X)

    train(X, y, self.config)
import sys
import os
import logging

from lib.utils import read_yaml
from lib.preprocess import preprocess, transform_to_long, save_pred_long_df
from model.dcrnn_top import train_dcrnn, run_dcrnn

sys.path.append(os.getcwd())

# load config, build data loaders, train DCRNN, predict, and save long-format predictions
args = read_yaml('dcrnn_config.yaml')
args, dataloaders, adj_mx, node_ids = preprocess(args)
args = train_dcrnn(args, dataloaders, adj_mx)
args, pred_df = run_dcrnn(args, dataloaders, adj_mx, node_ids)
long_df = transform_to_long(pred_df)
save_pred_long_df(args, long_df)
logging.shutdown()
def main():
    test_args = parse_args()

    args = joblib.load('models/%s/args.pkl' % test_args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    if args.pred_type == 'classification':
        num_outputs = 5
    elif args.pred_type == 'regression':
        num_outputs = 1
    elif args.pred_type == 'multitask':
        num_outputs = 6
    else:
        raise NotImplementedError

    cudnn.benchmark = True

    test_transform = transforms.Compose([
        transforms.Resize((args.input_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    # data loading code
    test_dir = preprocess(
        'test',
        args.img_size,
        scale=args.scale_radius,
        norm=args.normalize,
        pad=args.padding,
        remove=args.remove)
    test_df = pd.read_csv('inputs/test.csv')
    test_img_paths = test_dir + '/' + test_df['id_code'].values + '.png'
    test_labels = np.zeros(len(test_img_paths))

    test_set = Dataset(
        test_img_paths,
        test_labels,
        transform=test_transform)
    test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=4)

    preds = []
    for fold in range(args.n_splits):
        print('Fold [%d/%d]' % (fold + 1, args.n_splits))

        # create model
        model_path = 'models/%s/model_%d.pth' % (args.name, fold + 1)
        if not os.path.exists(model_path):
            print('%s does not exist.' % model_path)
            continue
        model = get_model(model_name=args.arch,
                          num_outputs=num_outputs,
                          freeze_bn=args.freeze_bn,
                          dropout_p=args.dropout_p)
        model = model.cuda()
        model.load_state_dict(torch.load(model_path))

        model.eval()

        preds_fold = []
        with torch.no_grad():
            for i, (input, _) in tqdm(enumerate(test_loader), total=len(test_loader)):
                if test_args.tta:
                    outputs = []
                    for input in apply_tta(input):
                        input = input.cuda()
                        output = model(input)
                        outputs.append(output.data.cpu().numpy()[:, 0])
                    preds_fold.extend(np.mean(outputs, axis=0))
                else:
                    input = input.cuda()
                    output = model(input)
                    preds_fold.extend(output.data.cpu().numpy()[:, 0])
        preds_fold = np.array(preds_fold)
        preds.append(preds_fold)

        if not args.cv:
            break

    preds = np.mean(preds, axis=0)

    if test_args.tta:
        args.name += '_tta'

    test_df['diagnosis'] = preds
    test_df.to_csv('probs/%s.csv' % args.name, index=False)

    # map regression outputs to the five ordinal classes
    thrs = [0.5, 1.5, 2.5, 3.5]
    preds[preds < thrs[0]] = 0
    preds[(preds >= thrs[0]) & (preds < thrs[1])] = 1
    preds[(preds >= thrs[1]) & (preds < thrs[2])] = 2
    preds[(preds >= thrs[2]) & (preds < thrs[3])] = 3
    preds[preds >= thrs[3]] = 4
    preds = preds.astype('int')

    test_df['diagnosis'] = preds
    test_df.to_csv('submissions/%s.csv' % args.name, index=False)
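# Hedged note: the manual threshold cascade above (thrs = [0.5, 1.5, 2.5, 3.5]) is
# equivalent to a single np.digitize call; a small sketch with assumed example values.
import numpy as np

preds = np.array([0.2, 0.7, 1.9, 2.6, 3.8])
thrs = [0.5, 1.5, 2.5, 3.5]
classes = np.digitize(preds, thrs)  # -> array([0, 1, 2, 3, 4]), same bins as the cascade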
def main():
    args = parse_args()

    if args.name is None:
        args.name = '%s_%s' % (args.arch, datetime.now().strftime('%m%d%H'))

    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('- %s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('- %s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    if args.loss == 'CrossEntropyLoss':
        criterion = nn.CrossEntropyLoss().cuda()
    elif args.loss == 'FocalLoss':
        criterion = FocalLoss().cuda()
    elif args.loss == 'MSELoss':
        criterion = nn.MSELoss().cuda()
    elif args.loss == 'multitask':
        criterion = {
            'classification': nn.CrossEntropyLoss().cuda(),
            'regression': nn.MSELoss().cuda(),
        }
    else:
        raise NotImplementedError

    if args.pred_type == 'classification':
        num_outputs = 5
    elif args.pred_type == 'regression':
        num_outputs = 1
    elif args.loss == 'multitask':
        num_outputs = 6
    else:
        raise NotImplementedError

    cudnn.benchmark = True

    model = get_model(model_name=args.arch,
                      num_outputs=num_outputs,
                      freeze_bn=args.freeze_bn,
                      dropout_p=args.dropout_p)

    train_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.img_size)),
        transforms.RandomAffine(
            degrees=(args.rotate_min, args.rotate_max) if args.rotate else 0,
            translate=(args.translate_min, args.translate_max) if args.translate else None,
            scale=(args.rescale_min, args.rescale_max) if args.rescale else None,
            shear=(args.shear_min, args.shear_max) if args.shear else None,
        ),
        transforms.CenterCrop(args.input_size),
        transforms.RandomHorizontalFlip(p=0.5 if args.flip else 0),
        transforms.RandomVerticalFlip(p=0.5 if args.flip else 0),
        transforms.ColorJitter(
            brightness=0, contrast=args.contrast, saturation=0, hue=0),
        RandomErase(
            prob=args.random_erase_prob if args.random_erase else 0,
            sl=args.random_erase_sl,
            sh=args.random_erase_sh,
            r=args.random_erase_r),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    val_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.input_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    # data loading code
    if 'diabetic_retinopathy' in args.train_dataset:
        diabetic_retinopathy_dir = preprocess(
            'diabetic_retinopathy',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        diabetic_retinopathy_df = pd.read_csv('inputs/diabetic-retinopathy-resized/trainLabels.csv')
        diabetic_retinopathy_img_paths = \
            diabetic_retinopathy_dir + '/' + diabetic_retinopathy_df['image'].values + '.jpeg'
        diabetic_retinopathy_labels = diabetic_retinopathy_df['level'].values

    if 'aptos2019' in args.train_dataset:
        aptos2019_dir = preprocess(
            'aptos2019',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        aptos2019_df = pd.read_csv('inputs/train.csv')
        aptos2019_img_paths = aptos2019_dir + '/' + aptos2019_df['id_code'].values + '.png'
        aptos2019_labels = aptos2019_df['diagnosis'].values

    if args.train_dataset == 'aptos2019':
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((aptos2019_img_paths[train_idx], aptos2019_img_paths[val_idx]))
            labels.append((aptos2019_labels[train_idx], aptos2019_labels[val_idx]))
    elif args.train_dataset == 'diabetic_retinopathy':
        img_paths = [(diabetic_retinopathy_img_paths, aptos2019_img_paths)]
        labels = [(diabetic_retinopathy_labels, aptos2019_labels)]
    elif 'diabetic_retinopathy' in args.train_dataset and 'aptos2019' in args.train_dataset:
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((np.hstack((aptos2019_img_paths[train_idx], diabetic_retinopathy_img_paths)),
                              aptos2019_img_paths[val_idx]))
            labels.append((np.hstack((aptos2019_labels[train_idx], diabetic_retinopathy_labels)),
                           aptos2019_labels[val_idx]))
    # else:
    #     raise NotImplementedError

    if args.pseudo_labels:
        test_df = pd.read_csv('probs/%s.csv' % args.pseudo_labels)
        test_dir = preprocess(
            'test',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)
        test_img_paths = test_dir + '/' + test_df['id_code'].values + '.png'
        test_labels = test_df['diagnosis'].values
        for fold in range(len(img_paths)):
            img_paths[fold] = (np.hstack((img_paths[fold][0], test_img_paths)), img_paths[fold][1])
            labels[fold] = (np.hstack((labels[fold][0], test_labels)), labels[fold][1])

    if 'messidor' in args.train_dataset:
        test_dir = preprocess(
            'messidor',
            args.img_size,
            scale=args.scale_radius,
            norm=args.normalize,
            pad=args.padding,
            remove=args.remove)

    folds = []
    best_losses = []
    best_scores = []

    for fold, ((train_img_paths, val_img_paths), (train_labels, val_labels)) in enumerate(zip(img_paths, labels)):
        print('Fold [%d/%d]' % (fold + 1, len(img_paths)))

        if os.path.exists('models/%s/model_%d.pth' % (args.name, fold + 1)):
            log = pd.read_csv('models/%s/log_%d.csv' % (args.name, fold + 1))
            best_loss, best_score = log.loc[log['val_loss'].values.argmin(), ['val_loss', 'val_score']].values
            folds.append(str(fold + 1))
            best_losses.append(best_loss)
            best_scores.append(best_score)
            continue

        if args.remove_duplicate:
            md5_df = pd.read_csv('inputs/strMd5.csv')
            duplicate_img_paths = aptos2019_dir + '/' + md5_df[(md5_df.strMd5_count > 1) & (~md5_df.diagnosis.isnull())]['id_code'].values + '.png'
            print(duplicate_img_paths)
            for duplicate_img_path in duplicate_img_paths:
                train_labels = train_labels[train_img_paths != duplicate_img_path]
                train_img_paths = train_img_paths[train_img_paths != duplicate_img_path]
                val_labels = val_labels[val_img_paths != duplicate_img_path]
                val_img_paths = val_img_paths[val_img_paths != duplicate_img_path]

        # train
        train_set = Dataset(
            train_img_paths,
            train_labels,
            transform=train_transform)

        _, class_sample_counts = np.unique(train_labels, return_counts=True)
        # print(class_sample_counts)
        # weights = 1. / torch.tensor(class_sample_counts, dtype=torch.float)
        # weights = np.array([0.2, 0.1, 0.6, 0.1, 0.1])
        # samples_weights = weights[train_labels]
        # sampler = WeightedRandomSampler(
        #     weights=samples_weights,
        #     num_samples=11000,
        #     replacement=False)
        # NOTE: when --class_aware is set, a `sampler` must be defined (see the
        # commented-out WeightedRandomSampler above); otherwise this raises NameError.
        train_loader = torch.utils.data.DataLoader(
            train_set,
            batch_size=args.batch_size,
            shuffle=False if args.class_aware else True,
            num_workers=4,
            sampler=sampler if args.class_aware else None)

        val_set = Dataset(
            val_img_paths,
            val_labels,
            transform=val_transform)
        val_loader = torch.utils.data.DataLoader(
            val_set,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=4)

        # create model
        model = get_model(model_name=args.arch,
                          num_outputs=num_outputs,
                          freeze_bn=args.freeze_bn,
                          dropout_p=args.dropout_p)
        model = model.cuda()
        if args.pretrained_model is not None:
            model.load_state_dict(torch.load('models/%s/model_%d.pth' % (args.pretrained_model, fold + 1)))
        # print(model)

        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'AdamW':
            optimizer = optim.AdamW(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'RAdam':
            optimizer = RAdam(
                filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
        elif args.optimizer == 'SGD':
            optimizer = optim.SGD(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=args.lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay,
                nesterov=args.nesterov)

        if args.scheduler == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=args.epochs, eta_min=args.min_lr)
        elif args.scheduler == 'ReduceLROnPlateau':
            scheduler = lr_scheduler.ReduceLROnPlateau(
                optimizer, factor=args.factor, patience=args.patience,
                verbose=1, min_lr=args.min_lr)

        log = {
            'epoch': [],
            'loss': [],
            'score': [],
            'val_loss': [],
            'val_score': [],
        }

        best_loss = float('inf')
        best_score = 0
        for epoch in range(args.epochs):
            print('Epoch [%d/%d]' % (epoch + 1, args.epochs))

            # train for one epoch
            train_loss, train_score = train(
                args, train_loader, model, criterion, optimizer, epoch)
            # evaluate on validation set
            val_loss, val_score = validate(args, val_loader, model, criterion)

            if args.scheduler == 'CosineAnnealingLR':
                scheduler.step()
            elif args.scheduler == 'ReduceLROnPlateau':
                scheduler.step(val_loss)

            print('loss %.4f - score %.4f - val_loss %.4f - val_score %.4f'
                  % (train_loss, train_score, val_loss, val_score))

            log['epoch'].append(epoch)
            log['loss'].append(train_loss)
            log['score'].append(train_score)
            log['val_loss'].append(val_loss)
            log['val_score'].append(val_score)

            pd.DataFrame(log).to_csv('models/%s/log_%d.csv' % (args.name, fold + 1), index=False)

            if val_loss < best_loss:
                torch.save(model.state_dict(), 'models/%s/model_%d.pth' % (args.name, fold + 1))
                best_loss = val_loss
                best_score = val_score
                print("=> saved best model")

        print('val_loss: %f' % best_loss)
        print('val_score: %f' % best_score)

        folds.append(str(fold + 1))
        best_losses.append(best_loss)
        best_scores.append(best_score)

        results = pd.DataFrame({
            'fold': folds + ['mean'],
            'best_loss': best_losses + [np.mean(best_losses)],
            'best_score': best_scores + [np.mean(best_scores)],
        })

        print(results)
        results.to_csv('models/%s/results.csv' % args.name, index=False)

        torch.cuda.empty_cache()

        if not args.cv:
            break
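# When --class_aware is set, the DataLoader above expects a `sampler` that only exists
# as commented-out code. A hedged reconstruction based solely on those comments follows;
# the inverse-frequency weighting, replacement flag, and default num_samples are
# assumptions, not the authors' confirmed settings.
import numpy as np
import torch
from torch.utils.data import WeightedRandomSampler

def build_class_aware_sampler(train_labels, num_samples=None):
    # assumes labels are contiguous integers 0..C-1, matching np.unique's sorted order
    _, class_sample_counts = np.unique(train_labels, return_counts=True)
    class_weights = 1. / torch.tensor(class_sample_counts, dtype=torch.float)
    samples_weights = class_weights[torch.as_tensor(train_labels, dtype=torch.long)]
    if num_samples is None:
        num_samples = len(train_labels)
    return WeightedRandomSampler(weights=samples_weights,
                                 num_samples=num_samples,
                                 replacement=True)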
def main():
    args = parse_args()

    np.random.seed(args.seed)
    cudnn.benchmark = False
    cudnn.deterministic = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)

    if args.name is None:
        args.name = '%s_%s' % (args.arch, datetime.now().strftime('%m%d%H'))

    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('- %s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('- %s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    if args.loss == 'CrossEntropyLoss':
        criterion = nn.CrossEntropyLoss().cuda()
    elif args.loss == 'FocalLoss':
        criterion = FocalLoss().cuda()
    elif args.loss == 'MSELoss':
        criterion = nn.MSELoss().cuda()
    elif args.loss == 'multitask':
        criterion = {
            'classification': nn.CrossEntropyLoss().cuda(),
            'regression': nn.MSELoss().cuda(),
        }
    else:
        raise NotImplementedError

    if args.pred_type == 'classification':
        num_outputs = 5
    elif args.pred_type == 'regression':
        num_outputs = 1
    elif args.loss == 'multitask':
        num_outputs = 6
    else:
        raise NotImplementedError

    train_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.img_size)),
        transforms.RandomAffine(
            degrees=(args.rotate_min, args.rotate_max) if args.rotate else 0,
            translate=(args.translate_min, args.translate_max) if args.translate else None,
            scale=(args.rescale_min, args.rescale_max) if args.rescale else None,
            shear=(args.shear_min, args.shear_max) if args.shear else None,
        ),
        transforms.CenterCrop(args.input_size),
        transforms.RandomHorizontalFlip(p=0.5 if args.flip else 0),
        transforms.RandomVerticalFlip(p=0.5 if args.flip else 0),
        transforms.ColorJitter(brightness=0, contrast=args.contrast, saturation=0, hue=0),
        RandomErase(prob=args.random_erase_prob if args.random_erase else 0,
                    sl=args.random_erase_sl,
                    sh=args.random_erase_sh,
                    r=args.random_erase_r),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    val_transform = transforms.Compose([
        transforms.Resize((args.img_size, args.input_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    # data loading code
    if 'diabetic_retinopathy' in args.train_dataset:
        diabetic_retinopathy_dir = preprocess('diabetic_retinopathy',
                                              args.img_size,
                                              scale=args.scale_radius,
                                              norm=args.normalize,
                                              pad=args.padding,
                                              remove=args.remove)
        diabetic_retinopathy_df = pd.read_csv(
            'inputs/diabetic-retinopathy-resized/trainLabels.csv')
        diabetic_retinopathy_img_paths = \
            diabetic_retinopathy_dir + '/' + diabetic_retinopathy_df['image'].values + '.jpeg'
        diabetic_retinopathy_labels = diabetic_retinopathy_df['level'].values

    if 'aptos2019' in args.train_dataset:
        aptos2019_dir = preprocess('aptos2019',
                                   args.img_size,
                                   scale=args.scale_radius,
                                   norm=args.normalize,
                                   pad=args.padding,
                                   remove=args.remove)
        aptos2019_df = pd.read_csv('inputs/train.csv')
        aptos2019_img_paths = aptos2019_dir + '/' + aptos2019_df['id_code'].values + '.png'
        aptos2019_labels = aptos2019_df['diagnosis'].values

    if 'chestxray' in args.train_dataset:
        chestxray_dir = preprocess('chestxray',
                                   args.img_size,
                                   scale=args.scale_radius,
                                   norm=args.normalize,
                                   pad=args.padding,
                                   remove=args.remove)
        chestxray_img_paths = []
        chestxray_labels = []
        # collect NORMAL (label 0) and PNEUMONIA (label 1) cases from every split
        for split in ('train', 'test', 'val'):
            for nor in glob('chest_xray/chest_xray/%s/NORMAL/*.jpeg' % split):
                chestxray_img_paths.append(chestxray_dir + '/' + nor.split('/')[-1])
                chestxray_labels.append(0)
            for abn in glob('chest_xray/chest_xray/%s/PNEUMONIA/*.jpeg' % split):
                chestxray_img_paths.append(chestxray_dir + '/' + abn.split('/')[-1])
                chestxray_labels.append(1)
        chestxray_img_paths = np.array(chestxray_img_paths)
        chestxray_labels = np.array(chestxray_labels)

    if args.train_dataset == 'aptos2019':
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(
                skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((aptos2019_img_paths[train_idx], aptos2019_img_paths[val_idx]))
            labels.append((aptos2019_labels[train_idx], aptos2019_labels[val_idx]))
    elif args.train_dataset == 'diabetic_retinopathy':
        img_paths = [(diabetic_retinopathy_img_paths, aptos2019_img_paths)]
        labels = [(diabetic_retinopathy_labels, aptos2019_labels)]
    elif 'diabetic_retinopathy' in args.train_dataset and 'aptos2019' in args.train_dataset:
        skf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        img_paths = []
        labels = []
        for fold, (train_idx, val_idx) in enumerate(
                skf.split(aptos2019_img_paths, aptos2019_labels)):
            img_paths.append((np.hstack((aptos2019_img_paths[train_idx],
                                         diabetic_retinopathy_img_paths)),
                              aptos2019_img_paths[val_idx]))
            labels.append((np.hstack((aptos2019_labels[train_idx],
                                      diabetic_retinopathy_labels)),
                           aptos2019_labels[val_idx]))

    # FL setting: separate data into users
    if 'diabetic_retinopathy' in args.train_dataset and 'aptos2019' in args.train_dataset:
        combined_paths = np.hstack((aptos2019_img_paths, diabetic_retinopathy_img_paths))
        combined_labels = np.hstack((aptos2019_labels, diabetic_retinopathy_labels))
    elif 'chestxray' in args.train_dataset:
        combined_paths = chestxray_img_paths
        combined_labels = chestxray_labels
    else:
        raise NotImplementedError

    user_ind_dict, ind_test = split_dataset(combined_labels, args.num_users, args.iid)

    model = get_model(model_name=args.arch,
                      num_outputs=num_outputs,
                      freeze_bn=args.freeze_bn,
                      dropout_p=args.dropout_p)
    model = model.cuda()

    test_set = Dataset(combined_paths[ind_test],
                       combined_labels[ind_test],
                       transform=val_transform)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=4)

    test_acc = []
    test_scores = []
    test_scores_f1 = []
    lr = args.lr

    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch + 1, args.epochs))

        weight_list = []
        # sample 10% of the users for this communication round
        selected_ind = np.random.choice(args.num_users, int(args.num_users / 10), replace=False)
        for i in selected_ind:
            print('user: %d' % (i + 1))
            train_set = Dataset(combined_paths[user_ind_dict[i]],
                                combined_labels[user_ind_dict[i]],
                                transform=train_transform)
            # NOTE: when --class_aware is set, a `sampler` must be defined beforehand;
            # otherwise this raises NameError.
            train_loader = torch.utils.data.DataLoader(
                train_set,
                batch_size=args.batch_size,
                shuffle=False if args.class_aware else True,
                num_workers=4,
                sampler=sampler if args.class_aware else None)

            # train a local copy of the global model for one epoch
            train_loss, train_score, ret_w = train(args, train_loader, copy.deepcopy(model), criterion, lr)
            weight_list.append(ret_w)
            print('loss %.4f - score %.4f' % (train_loss, train_score))

        # aggregate the local weights and update the global model
        weights = fedavg(weight_list)
        model.load_state_dict(weights)

        test_loss, test_score, test_scoref1, accuracy, confusion_matrix = test(
            args, test_loader, copy.deepcopy(model), criterion)
        print('loss %.4f - score %.4f - accuracy %.4f' % (test_loss, test_score, accuracy))

        test_acc.append(accuracy)
        test_scores.append(test_score)
        test_scores_f1.append(test_scoref1)
        lr *= 0.992

    np.savez('./accuracy-xray-iid' + str(args.iid) + '-' + str(args.epochs) + '-beta' + str(args.beta) + '-seed' + str(args.seed),
             acc=np.array(test_acc),
             score=np.array(test_scores),
             scoref1=np.array(test_scores_f1),
             confusion=confusion_matrix)
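# fedavg() is called above but not shown. A hedged sketch of one plausible implementation:
# an unweighted element-wise average of the selected users' state dicts. The real function
# may weight users by local sample count or use args.beta; this is an assumption for
# illustration only.
import copy
import torch

def fedavg(weight_list):
    # average each parameter/buffer tensor across the collected local state dicts
    avg = copy.deepcopy(weight_list[0])
    for key in avg.keys():
        for w in weight_list[1:]:
            avg[key] = avg[key] + w[key]
        avg[key] = torch.div(avg[key], len(weight_list))
    return avg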