def Dataloader(split, fold):
    if split == 'train':
        fold_idx = [i for i in range(1, config['#fold'] + 1) if i != fold]
        dataset = Dataset(split=split, fold_idx=fold_idx)
        if config['sampler'] == 'ROS':
            labels = dataset.labels
            class_weight = {
                label: 1 / float(labels.count(label))
                for label in range(config['#class'])
            }
            weight = [class_weight[label] for label in labels]
            sampler = data.WeightedRandomSampler(weight,
                                                 len(weight) * config['#sample_ROS'],
                                                 replacement=True)
            shuffle = False
        elif config['sampler'] == 'None':
            sampler = None
            shuffle = True
    else:
        fold_idx = [fold]
        dataset = Dataset(split=split, fold_idx=fold_idx)
        sampler = None
        shuffle = False
    data_loader = data.DataLoader(dataset,
                                  shuffle=shuffle,
                                  sampler=sampler,
                                  worker_init_fn=_init_fn,
                                  batch_size=config['#batch'],
                                  num_workers=config['#worker'])
    return data_loader
def get_dataloader(dataset, batch_size=128, clip=False, weights=None):
    if weights is not None:
        eps = 1e-1
        if clip:
            # clip per-sample weights to [mean - k*var, mean + k*var], floored at eps
            mean = weights.mean()
            var = weights.var()
            k = 2
            upper_bound = mean + k * var
            lower_bound = max(mean - k * var, eps)
            weight_list = np.array([
                lower_bound if i < lower_bound else
                (upper_bound if i > upper_bound else i)
                for i in weights
            ])
        else:
            weight_list = np.array([eps if i < eps else i for i in weights])
        sampler = data.WeightedRandomSampler(weight_list,
                                             len(weight_list),
                                             replacement=True)
        print(f'weight_list max: {weight_list.max()} min: {weight_list.min()} '
              f'mean: {weight_list.mean()} var: {weight_list.var()}')
    else:
        sampler = None
    dataloader = data.DataLoader(dataset=dataset,
                                 batch_size=batch_size,
                                 shuffle=False if sampler else True,
                                 sampler=sampler,
                                 num_workers=8,
                                 pin_memory=True)
    return dataloader
def make_batch_loader(self, batch_size=None, shuffle=None):
    shuffle = self.shuffle if shuffle is None else shuffle
    if self.sampling is not None and shuffle:
        print(f"SAMPLING with {self.sampling}")
        num_answers = defaultdict(int)
        for q in self.questions:
            num_answers[q["answer"]] += 1
        if self.sampling == "uniform_answer":
            weights = [1 / num_answers[q["answer"]] for q in self.questions]
            sampler = data.WeightedRandomSampler(weights=weights,
                                                 num_samples=len(self))
        batch_loader = data.DataLoader(
            dataset=self,
            batch_size=Options()["dataset.batch_size"],
            sampler=sampler,
            shuffle=False,
            pin_memory=Options()["misc.cuda"],
            num_workers=Options()["dataset.nb_threads"],
            collate_fn=self.collate_fn,
        )
    else:
        batch_loader = data.DataLoader(
            dataset=self,
            batch_size=Options()["dataset.batch_size"],
            shuffle=self.shuffle if shuffle is None else shuffle,
            pin_memory=Options()["misc.cuda"],
            num_workers=Options()["dataset.nb_threads"],
            collate_fn=self.collate_fn,
            sampler=None,
        )
    return batch_loader
def generateCaption(self, feature, stochastic=False):
    # initial input is the '<start>' token for every item in the batch
    initial_input = torch.ones((feature.shape[0], 1)).long().to('cuda')
    captions_compact = self.wordEmbedded(initial_input)
    feature = torch.unsqueeze(feature, 0)
    h_init = feature
    res = []
    for i in range(self.max_length):
        # note: h_out is never fed back in, so the RNN is re-initialised with the
        # image feature at every step (unlike the LSTM variant, which carries
        # hc_states forward)
        rnn_out, h_out = self.rnn(captions_compact, h_init)
        final = self.linear_Embed2Word(rnn_out)
        final = final.squeeze()
        if stochastic:
            # temperature-scaled softmax, then sample one word index per row
            final = self.caption_softmax(final / self.temperature)
            predicted = data.WeightedRandomSampler(weights=final,
                                                   num_samples=1,
                                                   replacement=False)
            predicted = torch.tensor(list(predicted)).long().to('cuda')
            predicted = torch.squeeze(predicted)
        else:
            _, predicted = final.max(1)
        res.append(predicted)
        captions_compact = self.wordEmbedded(predicted)
        captions_compact = torch.unsqueeze(captions_compact, 1)
    res = torch.stack(res, 1)
    return res
def make_loader_mt(dataset, batch_size):
    """Construct sampler that randomly chooses N items from N-sample dataset,
    weighted so that it's even across all tasks (so no task implicitly has
    higher priority than the others). Assumes the given dataset is a
    TensorDataset produced by trajectories_to_dataset_mt."""
    task_ids = dataset.tensor_dict['obs'].task_id
    assert len(task_ids) > 0 and batch_size > 0, \
        f"either {len(task_ids)}=0 task IDs or {batch_size}=0 batch size"
    unique_ids, frequencies = torch.unique(task_ids, return_counts=True, sorted=True)
    # all tasks must be present for this to work
    assert torch.all(unique_ids == torch.arange(len(unique_ids))), (unique_ids)
    freqs_total = torch.sum(frequencies).to(torch.float)
    unique_weights = freqs_total / frequencies.to(torch.float)
    unique_weights = unique_weights / unique_weights.sum()
    weights = unique_weights[task_ids]
    # even out the number of samples to be a multiple of batch size, always
    # rounding up
    n_samples = len(weights) + (-len(weights)) % batch_size
    assert n_samples >= len(weights) and 0 == n_samples % batch_size, \
        (batch_size, n_samples)
    weighted_sampler = data.WeightedRandomSampler(weights, n_samples, replacement=True)
    batch_sampler = data.BatchSampler(weighted_sampler,
                                      batch_size=batch_size,
                                      drop_last=True)
    loader = data.DataLoader(dataset,
                             pin_memory=False,
                             batch_sampler=batch_sampler,
                             collate_fn=fixed_default_collate)
    return loader
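A hedged, self-contained illustration of the weighting used above (toy tensor, not part of the original repo): with per-sample weights proportional to total_count / per_task_count, the expected number of draws per task comes out roughly equal even when the tasks are badly imbalanced.

# toy_task_ids is made up purely for this demonstration
import torch
from torch.utils import data

toy_task_ids = torch.tensor([0, 0, 0, 0, 1, 1, 2])   # task 0 heavily over-represented
_, freqs = torch.unique(toy_task_ids, return_counts=True)
per_task_w = freqs.sum().float() / freqs.float()
per_task_w = per_task_w / per_task_w.sum()
sample_w = per_task_w[toy_task_ids]                   # one weight per sample
sampler = data.WeightedRandomSampler(sample_w, num_samples=6000, replacement=True)
draws = torch.tensor(list(sampler))
print(torch.bincount(toy_task_ids[draws]))            # roughly 2000 draws per task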
def generateCaption(self, feature, stochastic=False):
    # initial input is the '<start>' token for every item in the batch
    initial_input = torch.ones((feature.shape[0], 1)).long().to('cuda')
    lstm_input = self.wordEmbedded(initial_input)
    feature = torch.unsqueeze(feature, 0)
    hc_states = (feature, feature)
    res = []
    for i in range(self.max_length):
        lstm_output, hc_states = self.lstm(lstm_input, hc_states)
        lstm_final_word = self.linear_Embed2Word(lstm_output)
        lstm_final_word = lstm_final_word.squeeze()
        if stochastic:
            # temperature-scaled softmax, then sample one word index per row
            lstm_final_word = self.caption_softmax(lstm_final_word / self.temperature)
            predicted = data.WeightedRandomSampler(weights=lstm_final_word,
                                                   num_samples=1,
                                                   replacement=False)
            predicted = torch.tensor(list(predicted)).long().to('cuda')
            predicted = torch.squeeze(predicted)
        else:
            _, predicted = lstm_final_word.max(1)
        res.append(predicted)
        lstm_input = self.wordEmbedded(predicted)
        lstm_input = torch.unsqueeze(lstm_input, 1)
    res = torch.stack(res, 1)
    return res
def data_sampler(dataset, shuffle, distributed, weights=None):
    if distributed:
        return data.distributed.DistributedSampler(dataset, shuffle=shuffle)
    if weights is not None:
        return data.WeightedRandomSampler(weights, len(weights), replacement=True)
    if shuffle:
        return data.RandomSampler(dataset)
    else:
        return data.SequentialSampler(dataset)
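A hedged usage sketch for the helper above (toy dataset and weights, not from the source): class-frequency weights are handed to data_sampler, and the resulting sampler goes to the DataLoader while shuffle stays at its default False, since sampler and shuffle=True are mutually exclusive.

# toy data only; per_sample_weights is one inverse-class-frequency weight per sample
import torch
from torch.utils import data

features = torch.randn(6, 3)
labels = torch.tensor([0, 0, 0, 0, 1, 1])
train_set = data.TensorDataset(features, labels)
per_sample_weights = (1.0 / torch.bincount(labels).float())[labels]
sampler = data_sampler(train_set, shuffle=True, distributed=False,
                       weights=per_sample_weights)
loader = data.DataLoader(train_set, batch_size=2, sampler=sampler)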
def sampler(self):
    # assumes OVERSAMPLING is 'pixel' or 'change'; otherwise sampling_weights is undefined
    if self.cfg.AUGMENTATION.OVERSAMPLING == 'pixel':
        sampling_weights = np.array([
            float(self._get_label_data(city).size) for city in self.cities
        ])
    if self.cfg.AUGMENTATION.OVERSAMPLING == 'change':
        sampling_weights = np.array([
            float(np.sum(self._get_label_data(city))) for city in self.cities
        ])
    sampler = torch_data.WeightedRandomSampler(weights=sampling_weights,
                                               num_samples=self.length,
                                               replacement=True)
    return sampler
def get_dataloader(dataset, batch_size=128, clip=False, weights=None):
    if weights is not None:
        sampler = data.WeightedRandomSampler(weights, len(weights), replacement=True)
        print(f'weight_list max: {weights.max()} min: {weights.min()} '
              f'mean: {weights.mean()} var: {weights.var()}')
    else:
        sampler = None
    dataloader = data.DataLoader(dataset=dataset,
                                 batch_size=batch_size,
                                 shuffle=False if sampler else True,
                                 sampler=sampler,
                                 num_workers=8,
                                 pin_memory=True)
    return dataloader
def get_dataloader(dataset, batch_size=128, weights=None, eps=1e-6):
    if weights is not None:
        # floor the weights at eps so every sample keeps a nonzero draw probability
        weight_list = [eps if i < eps else i for i in weights]
        sampler = data.WeightedRandomSampler(weight_list,
                                             len(weight_list),
                                             replacement=True)
    else:
        sampler = None
    dataloader = data.DataLoader(dataset=dataset,
                                 batch_size=batch_size,
                                 shuffle=False if sampler else True,
                                 sampler=sampler,
                                 num_workers=8,
                                 pin_memory=True)
    return dataloader
def _split_sampler(self, split):
    if split == 0.0:
        return None, None
    idx_full = np.arange(self.n_samples)
    np.random.seed(self.seed)
    np.random.shuffle(idx_full)
    if isinstance(split, int):
        assert split > 0
        assert split < self.n_samples, \
            "validation set size is configured to be larger than entire dataset."
        len_valid = split
    else:
        len_valid = int(self.n_samples * split)

    # If a validation file (valtest) is specified, use it as the validation set;
    # otherwise split the validation set off from the training data.
    if self.val_file:
        valid_idx = self.valid_idx
        train_idx = np.array([idx for idx in idx_full if idx not in valid_idx])
    else:
        valid_idx = idx_full[0:len_valid]
        train_idx = np.delete(idx_full, np.arange(0, len_valid))

    # class-balanced weights for training samples; validation indices get weight 0
    weights_per_class = 1. / torch.tensor(self.emotion_nums, dtype=torch.float)
    weights = [0] * self.n_samples
    for idx in range(self.n_samples):
        if idx in valid_idx:
            weights[idx] = 0.
        else:
            label = self.dataset[idx][0]
            weights[idx] = weights_per_class[label]
    weights = torch.tensor(weights)
    train_sampler = data.WeightedRandomSampler(weights=weights,
                                               num_samples=len(weights),
                                               replacement=True)
    valid_sampler = data.SubsetRandomSampler(valid_idx)

    # turn off shuffle option which is mutually exclusive with sampler
    self.shuffle = False
    self.n_samples = len(train_idx)
    return train_sampler, valid_sampler
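A hedged sanity check of the zero-weight trick used above (toy numbers, independent of the class): indices whose weight is set to 0, as done for the validation indices, are never produced by WeightedRandomSampler, so the training sampler cannot leak validation samples.

# toy weights; the first two indices stand in for the "held-out" validation items
import torch
from torch.utils import data

weights = torch.tensor([0.0, 0.0, 0.5, 0.25, 0.25])
sampler = data.WeightedRandomSampler(weights, num_samples=1000, replacement=True)
assert set(sampler).isdisjoint({0, 1})   # zero-weight indices are never drawn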
def train_dataloader(self):
    # REQUIRED
    cfg = self.cfg
    use_edge_loss = cfg.MODEL.LOSS_TYPE == 'FrankensteinEdgeLoss'

    trfm = []
    trfm.append(BGR2RGB())
    if cfg.DATASETS.USE_CLAHE_VARI:
        trfm.append(VARI())
    if cfg.AUGMENTATION.RESIZE:
        trfm.append(Resize(scale=cfg.AUGMENTATION.RESIZE_RATIO))
    if cfg.AUGMENTATION.CROP_TYPE == 'uniform':
        trfm.append(UniformCrop(crop_size=cfg.AUGMENTATION.CROP_SIZE))
    elif cfg.AUGMENTATION.CROP_TYPE == 'importance':
        trfm.append(ImportanceRandomCrop(crop_size=cfg.AUGMENTATION.CROP_SIZE))
    if cfg.AUGMENTATION.RANDOM_FLIP_ROTATE:
        trfm.append(RandomFlipRotate())
    trfm.append(Npy2Torch())
    trfm = transforms.Compose(trfm)

    dataset = Xview2Detectron2Dataset(
        cfg.DATASETS.TRAIN[0],
        pre_or_post=cfg.DATASETS.PRE_OR_POST,
        include_image_weight=True,
        transform=trfm,
        include_edge_mask=use_edge_loss,
        use_clahe=cfg.DATASETS.USE_CLAHE_VARI,
    )

    dataloader_kwargs = {
        'batch_size': cfg.TRAINER.BATCH_SIZE,
        'num_workers': cfg.DATALOADER.NUM_WORKER,
        'shuffle': cfg.DATALOADER.SHUFFLE,
        'drop_last': True,
        'pin_memory': True,
    }

    # sampler
    if cfg.AUGMENTATION.IMAGE_OVERSAMPLING_TYPE == 'simple':
        image_p = self.image_sampling_weight(dataset.dataset_metadata)
        sampler = torch_data.WeightedRandomSampler(weights=image_p,
                                                   num_samples=len(image_p))
        dataloader_kwargs['sampler'] = sampler
        dataloader_kwargs['shuffle'] = False

    dataloader = torch_data.DataLoader(dataset, **dataloader_kwargs)
    return dataloader
def main():
    # hyper parameters
    batch_size = 128
    char_len = 150
    MAX_epoch = 50
    encode_dim = 250
    feat_num = 256
    device = torch.device('cuda:0')
    seed = 2434
    data_path = Path('../data/train.csv')
    patience = 7
    n_fold = 4
    momentum = 0.9
    lr = 0.01
    ##############################

    set_random_seed(seed)
    X_data, y_data = load_data(data_path)
    train = [(x, y) for x, y in zip(X_data, y_data)]
    kf = KFold(n_splits=n_fold)
    for i, (train_idx, valid_idx) in enumerate(kf.split(train)):
        print(f'Fold : {i+1}')
        train_fold = [train[i] for i in train_idx]
        valid_fold = [train[i] for i in valid_idx]
        char2idx, ignore_idx = make_char2idx([text for text, _ in train])

        # oversample the minority class: one weight per sample, 1 / its class count
        target_arr = np.array([y for _, y in train_fold])
        weight_dict = {i: np.sum(target_arr == i) for i in range(2)}
        weight = 1 / torch.Tensor([weight_dict[i] for i in target_arr])
        sampler = data.WeightedRandomSampler(weight, len(weight))

        train_data = PrepreprocessData(train_fold, char_len, char2idx, ignore_idx)
        valid_data = PrepreprocessData(valid_fold, char_len, char2idx, ignore_idx)
        train_loader = data.DataLoader(dataset=train_data, batch_size=batch_size,
                                       sampler=sampler, num_workers=4, pin_memory=True)
        valid_loader = data.DataLoader(dataset=valid_data, batch_size=batch_size,
                                       shuffle=False, num_workers=4, pin_memory=True)

        model = CLCNN(encode_dim, char_len, len(char2idx), ignore_idx, feat_num).to(device)
        loss_func = nn.BCEWithLogitsLoss(reduction="sum")
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
        early_stopping = EarlyStopping(char2idx, patience=patience, verbose=True)

        for epoch in range(MAX_epoch):
            start_time = time.time()
            train_loss = trainer.train(model, train_loader, loss_func, device, optimizer)
            valid_loss = trainer.valid(model, valid_loader, loss_func, device)
            elapsed_time = time.time() - start_time
            print(f'Epoch {epoch+1}/{MAX_epoch} \t loss={train_loss:.4f} \t '
                  f'val_loss={valid_loss:.4f} \t time={elapsed_time:.2f}')
            early_stopping(valid_loss, model)
            if early_stopping.early_stop:
                print('stop')
                break
        print(f'{i}-fold best result valid loss : {early_stopping.best_score:.2f}')
            {"indices": [1]},
            [],
            {},
            data.SubsetRandomSampler(indices=[1]),
            id="SubsetRandomSamplerConf",
        ),
        pytest.param(
            "utils.data.sampler",
            "WeightedRandomSampler",
            {"weights": [1], "num_samples": 1},
            [],
            {},
            data.WeightedRandomSampler(weights=[1], num_samples=1),
            id="WeightedRandomSamplerConf",
        ),
        # TODO: investigate testing distributed instantiation
        # pytest.param(
        #     "utils.data.distributed",
        #     "DistributedSampler",
        #     {},
        #     [],
        #     {"dataset": dummy_dataset},
        #     data.DistributedSampler(group=dummy_group, dataset=dummy_dataset),
        #     id="DistributedSamplerConf",
        # ),
    ],
)
def test_instantiate_classes(
os.environ['PYTHONHASHSEED'] = str(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

vocabulary = vocab.VocabDictionary()
vocabulary.create_from_count_file(args.vocabulary)

train_dataset = text_dataset.SegmentationTextDataset(args.train_corpus, vocabulary,
                                                     args.min_split_samples_batch_ratio,
                                                     args.unk_noise_prob)
if args.min_split_samples_batch_ratio > 0.0:
    sampler = data.WeightedRandomSampler(train_dataset.weights, len(train_dataset.weights))
    train_dataloader = data.DataLoader(train_dataset, num_workers=0,
                                       batch_size=args.batch_size, drop_last=True,
                                       collate_fn=text_dataset.collater, sampler=sampler)
else:
    train_dataloader = data.DataLoader(train_dataset, num_workers=0,
                                       batch_size=args.batch_size, shuffle=True,
                                       drop_last=True, collate_fn=text_dataset.collater)

dev_dataset = text_dataset.SegmentationTextDataset(args.dev_corpus, vocabulary)
dev_dataloader = data.DataLoader(dev_dataset, num_workers=0, batch_size=args.batch_size,
                                 shuffle=False, drop_last=False,
                                 collate_fn=text_dataset.collater)

if args.model_architecture == "ff_text":
    model = SimpleRNNFFTextModel(args, vocabulary).to(device)
else:
def train_model(training_data, output_dir, params):
    """ Create the output file paths """
    checkpoint_file = os.path.join(output_dir, "chkpt.pth")
    trained_model = os.path.join(output_dir, "model.pth")
    metrics_file = os.path.join(output_dir, 'metrics.csv')
    best_model_file = os.path.join(output_dir, 'model_early_stop.pth')

    """ Build the Data supply pipeline for the training and validation """
    input_data = {}
    input_data['train'], input_data['eval'] = AudioSpectDataset.get_datasets(
        inputfile=training_data,
        train_ratio=0.9,
        label_dict=params["label_dict"],
    )
    dataset_sizes = {x: len(input_data[x]) for x in ['train', 'eval']}

    """ Handle class imbalance by oversampling. Define the sampler """
    all_labels = input_data['train'].get_all_labels()
    class_freq = np.bincount(all_labels)
    class_weights = 100.0 / class_freq
    each_samples_weight = class_weights[all_labels]
    sampler = data.WeightedRandomSampler(each_samples_weight, len(input_data['train']))

    dataloaders = {}
    for x in ['train', 'eval']:
        if x == 'train':
            dataloaders[x] = data.DataLoader(input_data[x],
                                             batch_size=params["batch_size"],
                                             sampler=sampler,
                                             num_workers=1)
        else:
            dataloaders[x] = data.DataLoader(input_data[x],
                                             batch_size=params["batch_size"],
                                             num_workers=1)

    """ Getting ready for training
        1. Set up the model instance.
        2. If checkpoint exists from an interrupted previous run, load from the checkpoint
        3. Set up the Loss function
        4. Set up the Optimizer
    """
    classifier = ChimpCallClassifier(
        num_labels=params["num_labels"],
        spectrogram_shape=params["spectrogram_shape"],
        dropout=params["dropout"]).float()

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            classifier = nn.DataParallel(classifier)
    classifier.to(device)

    loss_func = torch.nn.CrossEntropyLoss()
    optimizer_func = torch.optim.Adam(classifier.parameters(),
                                      lr=params["learning_rate"],
                                      eps=params["epsilon"],
                                      weight_decay=params["weight_decay"])
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_func,
                                           step_size=params["scheduler_step_size"],
                                           gamma=0.1)

    # if checkpoint from an interrupted run exists, load the model, optimizer,
    # early stopper and metrics
    if os.path.exists(checkpoint_file):
        try:
            checkpoint_data = torch.load(checkpoint_file)
            classifier.load_state_dict(checkpoint_data['model_state_dict'])
            optimizer_func.load_state_dict(checkpoint_data['optim_state_dict'])
            start_epoch = checkpoint_data['epoch'] + 1
            metrics_list = checkpoint_data['metrics']
            lowest_loss = checkpoint_data['lowest_loss']
            best_epoch = checkpoint_data['best_epoch']
            print("Checkpoint found. Loaded!")
            print("Re-starting training with epoch# {}".format(
                start_epoch + 1))  # epochs shown on screen start from 1
        except Exception as e:
            print("Error in reinstating interrupted run. \nError: {}".format(e))
            exit()
    else:
        start_epoch = 0
        metrics_list = []
        lowest_loss = 1e14
        best_epoch = 0

    """ Training and validation """
    print("*" * 70)
    print("Training Starting")
    print("To restart, when launching use the option: -o {}".format(output_dir))
    print("*" * 70)

    for epoch in range(start_epoch, params["num_epochs"]):
        epoch_loss = {'train': 0.0, 'eval': 0.0}
        epoch_accuracy = {'train': 0.0, 'eval': 0.0}
        labels_input = []
        labels_preds = []
        precision = recall = f1 = 0

        for phase in ['train', 'eval']:
            running_loss = 0.0
            running_corrects = 0
            if phase == 'train':
                classifier.train()
            else:
                classifier.eval()

            for samples in dataloaders[phase]:
                input_spects = samples['spectrogram'].to(device)
                input_labels = samples['label'].to(device)
                optimizer_func.zero_grad()  # Clear off the gradients from any past operation
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = classifier(input_spects)  # Do the forward pass
                    loss = loss_func(outputs, input_labels)  # Calculate the loss
                    if phase == 'train':
                        loss.backward()  # Calculate the gradients with help of back propagation
                        optimizer_func.step()  # Ask the optimizer to adjust the parameters based on the gradients

                # Record the predictions
                _, predicted = torch.max(outputs, 1)  # the indexes are the predicted classes. Need only that from torch.max

                # set up metrics
                running_loss += loss.item() * input_labels.size(0)  # accumulate the loss
                running_corrects += (predicted == input_labels).sum()
                if phase == 'eval':
                    labels_preds.append(predicted)
                    labels_input.append(input_labels)

            if phase == 'train':
                exp_lr_scheduler.step()

            """ Calculate performance metrics for the train and eval runs of this epoch """
            epoch_loss[phase] = running_loss / dataset_sizes[phase]
            epoch_accuracy[phase] = running_corrects.item() / dataset_sizes[phase]
            if phase == 'eval':
                all_input_labels = torch.cat(labels_input).cpu()
                all_preds_labels = torch.cat(labels_preds).cpu()
                precision = precision_score(all_input_labels, all_preds_labels,
                                            average='weighted')
                recall = recall_score(all_input_labels, all_preds_labels,
                                      average='weighted')
                f1 = f1_score(all_input_labels, all_preds_labels, average='weighted')
                cfm = confusion_matrix(all_input_labels, all_preds_labels,
                                       labels=range(params["num_labels"]))

        # Finish up the Epoch: Save model & optimizer state, metrics and earlystop.
        # Print performance. Check early stopping
        metrics_list.append({
            "epoch": epoch + 1,
            "train_loss": epoch_loss['train'],
            "train_acc": epoch_accuracy['train'],
            "eval_loss": epoch_loss['eval'],
            "accuracy": epoch_accuracy['eval'],
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "cfm": cfm
        })
        print('Epoch %2d/%d, Training (Loss: %.4f, Acc: %.2f ), '
              'Validation (Loss: %.4f, Acc: %.2f , precision: %.2f, recall: %.2f, f1: %.2f) ' %
              (epoch + 1, params["num_epochs"], epoch_loss['train'],
               epoch_accuracy['train'] * 100, epoch_loss['eval'],
               epoch_accuracy['eval'] * 100, precision * 100, recall * 100, f1 * 100))

        if epoch_loss['eval'] < lowest_loss:
            best_epoch = epoch
            lowest_loss = epoch_loss['eval']
            torch.save(classifier.state_dict(), best_model_file)

        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': classifier.state_dict(),
                'optim_state_dict': optimizer_func.state_dict(),
                'best_epoch': best_epoch,
                'lowest_loss': lowest_loss,
                'metrics': metrics_list
            }, checkpoint_file)

    """ Save to disk: final model and all epoch metrics as csv file
        Final Model: the best performing model is always saved at the location
        'best_model_file' because of the Early Stop code
    """
    classifier.load_state_dict(torch.load(best_model_file))
    torch.save(
        {
            'model': classifier.state_dict(),
            'labels': {idx: label for label, idx in params["label_dict"].items()},
            'spectrogram_shape': params["spectrogram_shape"],
            'dropout': params["dropout"]
        }, trained_model)
    pd.DataFrame(metrics_list,
                 columns=["epoch", "train_loss", "train_acc", "eval_loss", "accuracy",
                          "precision", "recall", "f1", "cfm"]).to_csv(metrics_file,
                                                                      index=False,
                                                                      header=True)

    # clean up, remove the temporary files used to store runtime state
    if os.path.exists(best_model_file):
        os.remove(best_model_file)
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

    print("Training Complete! Epoch #{:2d} saved".format(best_epoch + 1))
    print("Model at: {}".format(trained_model))
    print("Metrics at: {}".format(metrics_file))
def create_weighted_sampler(labels):
    labels_unique, counts = np.unique(labels, return_counts=True)
    class_weights = [sum(counts) / c for c in counts]
    example_weights = [class_weights[int(e)] for e in labels]
    sampler = data_utils.WeightedRandomSampler(example_weights, len(labels))
    return sampler
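A hedged usage sketch for the helper above (toy tensors, not from the source): create_weighted_sampler indexes class_weights by label value, so it assumes labels in the range 0..K-1; the sampler then goes to the DataLoader in place of shuffle=True.

# toy dataset only; 'data_utils' is assumed to alias torch.utils.data as in the snippet
import numpy as np
import torch
import torch.utils.data as data_utils

labels = np.array([0, 0, 0, 1, 1, 2])
features = torch.randn(len(labels), 4)
dataset = data_utils.TensorDataset(features, torch.as_tensor(labels))
sampler = create_weighted_sampler(labels)
loader = data_utils.DataLoader(dataset, batch_size=2, sampler=sampler)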
def create_train_val_data_loaders(data_dir, *, min_pts=75, batch_size=32,
                                  validation_frac=0.2, num_of_workers=0):
    """ Return pair of pytorch dataloaders for train and validation sets. """
    # sample => scale => (if train) random jitter and random rotation along z axis
    # => transform to Pytorch tensor
    mps_transform = MinPointSampler(min_pts, replace_flag=True)
    pt_scaler = PointScaler()
    points_aug = RndPointsAugmentations(jitter_b=0.3)
    train_transforms = transforms.Compose(
        [mps_transform, points_aug, pt_scaler, transforms.ToTensor()])
    val_transforms = transforms.Compose(
        [mps_transform, pt_scaler, transforms.ToTensor()])

    train_data = datasets.DatasetFolder(
        data_dir,
        loader=lambda x: np.load(x).astype(np.float32),
        extensions=("npy"),
        transform=train_transforms)
    val_data = datasets.DatasetFolder(
        data_dir,
        loader=lambda x: np.load(x).astype(np.float32),
        extensions=("npy"),
        transform=val_transforms)

    dataset_len = len(train_data)
    indices = np.arange(dataset_len)
    val_abs_size = int(np.floor(validation_frac * dataset_len))  # np.int was removed from NumPy; plain int is equivalent
    np.random.shuffle(indices)
    train_id, val_id = indices[val_abs_size:], indices[:val_abs_size]

    all_dataset = train_data.samples.copy()
    all_targets = train_data.targets.copy()
    train_data.samples = [all_dataset[i] for i in train_id]
    train_data.targets = [all_targets[i] for i in train_id]

    # weight every sample by the inverse frequency of its class
    train_weight = 1 / np.array([(np.array(train_data.targets) == tgt).sum()
                                 for tgt in np.unique(train_data.targets)])
    train_samples_weight = torch.tensor(
        [train_weight[tgt] for tgt in train_data.targets])
    train_sampler = tdata.WeightedRandomSampler(train_samples_weight,
                                                len(train_samples_weight))
    train_loader = tdata.DataLoader(train_data,
                                    sampler=train_sampler,
                                    batch_size=batch_size,
                                    num_workers=num_of_workers,
                                    drop_last=True)

    if validation_frac > 0:
        val_data.samples = [all_dataset[i] for i in val_id]
        val_data.targets = [all_targets[i] for i in val_id]
        # readjust probabilities for unbalanced classes
        val_weight = 1 / np.array([(np.array(val_data.targets) == tgt).sum()
                                   for tgt in np.unique(val_data.targets)])
        val_samples_weight = torch.tensor(
            [val_weight[tgt] for tgt in val_data.targets])
        val_sampler = tdata.WeightedRandomSampler(val_samples_weight,
                                                  len(val_samples_weight))
        val_loader = tdata.DataLoader(val_data,
                                      sampler=val_sampler,
                                      batch_size=batch_size,
                                      num_workers=num_of_workers,
                                      drop_last=True)
    else:
        val_loader = None
    return train_loader, val_loader
def main():
    ia.seed(1)
    train_datapath = "./food11re/skewed_training"
    valid_datapath = "./food11re/validation"
    test_datapath = "./food11re/evaluation"

    transform = transforms.Compose([
        transforms.RandomRotation(30),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        ImgAugTransform(),
        lambda x: PIL.Image.fromarray(x),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    train_dataset = Food11Dataset(train_datapath, is_train=True)
    train_dataset_folder = torchvision.datasets.ImageFolder(
        root='./food11re/skewed_training', transform=transform)
    valid_dataset = Food11Dataset(valid_datapath, is_train=False)
    test_dataset = Food11Dataset(test_datapath, is_train=False)

    # wts = [100, 781, 67, 169, 196, 75, 757, 1190, 194, 67, 2857]
    # train_dataset.augmentation(wts)

    # inverse-frequency weight per class, then one weight per sample
    weight = []
    for i in range(11):
        class_count = train_dataset_folder.targets.count(i)
        weight.append(1. / (class_count / len(train_dataset_folder.targets)))
    # note: iterating train_dataset_folder loads every image just to read its label;
    # the labels are also available directly via train_dataset_folder.targets
    samples_weight = np.array([weight[t] for _, t in train_dataset_folder])
    weighted_sampler = data.WeightedRandomSampler(samples_weight,
                                                  num_samples=15000,
                                                  replacement=True)
    random_sampler = data.RandomSampler(train_dataset,
                                        replacement=True,
                                        num_samples=9000,
                                        generator=None)

    print("----------------------------------------------------------------------------------")
    print("Dataset bf. loading - ", train_datapath)
    print(train_dataset.show_details())
    print("----------------------------------------------------------------------------------")
    print("Dataset bf. loading - ", valid_datapath)
    print(valid_dataset.show_details())
    print("----------------------------------------------------------------------------------")
    print("Dataset bf. loading - ", test_datapath)
    print(test_dataset.show_details())

    train_folder_loader = DataLoader(dataset=train_dataset_folder, num_workers=0,
                                     batch_size=100, sampler=weighted_sampler)
    train_loader = DataLoader(dataset=train_dataset, num_workers=0,
                              batch_size=100, sampler=random_sampler)
    valid_loader = DataLoader(dataset=valid_dataset, num_workers=0,
                              batch_size=100, shuffle=False)
    test_loader = DataLoader(dataset=test_dataset, num_workers=0,
                             batch_size=100, shuffle=False)

    data_loading(train_folder_loader, train_dataset)
    data_loading(train_loader, train_dataset)
    data_loading(valid_loader, valid_dataset)
    data_loading(test_loader, test_dataset)
def train(net, data, classes, test_features, test_classes):
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    optimizer = optim.Adam(net.parameters(), lr=0.05)

    t_dataset = Data.TensorDataset(data, classes)
    samples_weight = [1/5, 1/5, 1/5]
    # note: this sampler is built but never passed to the DataLoader below,
    # so shuffle=True is what actually drives the sampling order
    sampler = Data.WeightedRandomSampler(samples_weight, 30)

    # Batch size is one
    loader = Data.DataLoader(dataset=t_dataset, batch_size=1,
                             num_workers=0, shuffle=True)
    d = data[0]
    c = classes[0]
    co_far = 0
    print()
    print()
    print()
    for epoch in range(99):  # loop over the dataset multiple times
        ct = 0
        running_loss = 0.0
        for i, data in enumerate(loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            outputs = torch.reshape(outputs, (1, 3))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            ct += 1
        if (epoch + 1) % 10 == 0:
            record_acc(net, test_features, test_classes)
        print('[%d] loss: %.8f' % (epoch + 1, running_loss / ct))
        ct = 0
        running_loss = 0.0
    return net
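A hedged, self-contained sketch of how the sampler above would actually be wired (toy tensors, not a change to the snippet): the sampler must replace shuffle=True, and the weights need one entry per dataset item rather than one per class.

# 'Data' is assumed to alias torch.utils.data as in the snippet above
import torch
import torch.utils.data as Data

features = torch.randn(30, 4)
classes = torch.randint(0, 3, (30,))
t_dataset = Data.TensorDataset(features, classes)
samples_weight = [1.0] * len(t_dataset)   # one weight per sample, not per class
sampler = Data.WeightedRandomSampler(samples_weight, num_samples=len(t_dataset))
loader = Data.DataLoader(dataset=t_dataset, batch_size=1, num_workers=0, sampler=sampler)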
def train(cfg, writer, logger, start_iter=0, model_only=False, gpu=-1, save_dir=None):
    # Setup seeds and config
    torch.manual_seed(cfg.get("seed", 1337))
    torch.cuda.manual_seed(cfg.get("seed", 1337))
    np.random.seed(cfg.get("seed", 1337))
    random.seed(cfg.get("seed", 1337))

    # Setup device
    if gpu == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device("cuda:%d" % gpu if torch.cuda.is_available() else "cpu")

    # Setup Augmentations
    augmentations = cfg["training"].get("augmentations", None)
    if cfg["data"]["dataset"] == "softmax_cityscapes_convention":
        data_aug = get_composed_augmentations_softmax(augmentations)
    else:
        data_aug = get_composed_augmentations(augmentations)

    # Setup Dataloader
    data_loader = get_loader(cfg["data"]["dataset"])
    data_path = cfg["data"]["path"]
    t_loader = data_loader(
        data_path,
        config=cfg["data"],
        is_transform=True,
        split=cfg["data"]["train_split"],
        img_size=(cfg["data"]["img_rows"], cfg["data"]["img_cols"]),
        augmentations=data_aug,
    )
    v_loader = data_loader(
        data_path,
        config=cfg["data"],
        is_transform=True,
        split=cfg["data"]["val_split"],
        img_size=(cfg["data"]["img_rows"], cfg["data"]["img_cols"]),
    )

    sampler = None
    if "sampling" in cfg["data"]:
        sampler = data.WeightedRandomSampler(
            weights=get_sampling_weights(t_loader, cfg["data"]["sampling"]),
            num_samples=len(t_loader),
            replacement=True,
        )

    n_classes = t_loader.n_classes
    trainloader = data.DataLoader(
        t_loader,
        batch_size=cfg["training"]["batch_size"],
        num_workers=cfg["training"]["n_workers"],
        sampler=sampler,
        shuffle=sampler is None,
    )
    valloader = data.DataLoader(
        v_loader,
        batch_size=cfg["training"]["batch_size"],
        num_workers=cfg["training"]["n_workers"],
    )

    # Setup Metrics
    running_metrics_val = {"seg": runningScoreSeg(n_classes)}
    if "classifiers" in cfg["data"]:
        for name, classes in cfg["data"]["classifiers"].items():
            running_metrics_val[name] = runningScoreClassifier(len(classes))
    if "bin_classifiers" in cfg["data"]:
        for name, classes in cfg["data"]["bin_classifiers"].items():
            running_metrics_val[name] = runningScoreClassifier(2)

    # Setup Model
    model = get_model(cfg["model"], n_classes).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print('Parameters:', total_params)

    if gpu == -1:
        model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    else:
        model = torch.nn.DataParallel(model, device_ids=[gpu])
    model.apply(weights_init)

    pretrained_path = 'weights/hardnet_petite_base.pth'
    weights = torch.load(pretrained_path)
    model.module.base.load_state_dict(weights)

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k: v for k, v in cfg["training"]["optimizer"].items()
                        if k != "name"}
    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    print("Using optimizer {}".format(optimizer))

    scheduler = get_scheduler(optimizer, cfg["training"]["lr_schedule"])
    loss_dict = get_loss_function(cfg, device)

    if cfg["training"]["resume"] is not None:
        if os.path.isfile(cfg["training"]["resume"]):
            logger.info(
                "Loading model and optimizer from checkpoint '{}'".format(cfg["training"]["resume"])
            )
            checkpoint = torch.load(cfg["training"]["resume"], map_location=device)
            model.load_state_dict(checkpoint["model_state"], strict=False)
            if not model_only:
                optimizer.load_state_dict(checkpoint["optimizer_state"])
                scheduler.load_state_dict(checkpoint["scheduler_state"])
                start_iter = checkpoint["epoch"]
            logger.info(
                "Loaded checkpoint '{}' (iter {})".format(
                    cfg["training"]["resume"], checkpoint["epoch"]
                )
            )
        else:
            logger.info("No checkpoint found at '{}'".format(cfg["training"]["resume"]))

    if cfg["training"]["finetune"] is not None:
        if os.path.isfile(cfg["training"]["finetune"]):
            logger.info(
                "Loading model and optimizer from checkpoint '{}'".format(cfg["training"]["finetune"])
            )
            checkpoint = torch.load(cfg["training"]["finetune"])
            model.load_state_dict(checkpoint["model_state"])

    val_loss_meter = averageMeter()
    time_meter = averageMeter()
    best_iou = -100.0
    i = start_iter
    flag = True

    loss_all = 0
    loss_n = 0
    while i <= cfg["training"]["train_iters"] and flag:
        for (images, label_dict, _) in trainloader:
            i += 1
            start_ts = time.time()
            scheduler.step()
            model.train()
            images = images.to(device)

            optimizer.zero_grad()
            output_dict = model(images)
            loss = compute_loss(  # considers key names in loss_dict and output_dict
                loss_dict, images, label_dict, output_dict, device, t_loader
            )
            loss.backward()  # backprops sum of loss tensors, frozen components will have no grad_fn
            optimizer.step()
            c_lr = scheduler.get_lr()

            if i % 1000 == 0:
                # log images, seg ground truths, predictions
                pred_array = output_dict["seg"].data.max(1)[1].cpu().numpy()
                gt_array = label_dict["seg"].data.cpu().numpy()
                softmax_gt_array = None
                if "softmax" in label_dict:
                    softmax_gt_array = label_dict["softmax"].data.max(1)[1].cpu().numpy()
                write_images_to_board(t_loader, images, gt_array, pred_array, i,
                                      name='train', softmax_gt=softmax_gt_array)
                if save_dir is not None:
                    image_array = images.data.cpu().numpy().transpose(0, 2, 3, 1)
                    write_images_to_dir(t_loader, image_array, gt_array, pred_array, i,
                                        save_dir, name='train', softmax_gt=softmax_gt_array)

            time_meter.update(time.time() - start_ts)
            loss_all += loss.item()
            loss_n += 1

            if (i + 1) % cfg["training"]["print_interval"] == 0:
                fmt_str = "Iter [{:d}/{:d}] Loss: {:.4f} Time/Image: {:.4f} lr={:.6f}"
                print_str = fmt_str.format(
                    i + 1,
                    cfg["training"]["train_iters"],
                    loss_all / loss_n,
                    time_meter.avg / cfg["training"]["batch_size"],
                    c_lr[0],
                )
                print(print_str)
                logger.info(print_str)
                writer.add_scalar("loss/train_loss", loss.item(), i + 1)
                time_meter.reset()

            if (i + 1) % cfg["training"]["val_interval"] == 0 or \
                    (i + 1) == cfg["training"]["train_iters"]:
                torch.cuda.empty_cache()
                model.eval()  # set batchnorm and dropouts to work in eval mode
                loss_all = 0
                loss_n = 0
                with torch.no_grad():  # Deactivate torch autograd engine, less memusage
                    for i_val, (images_val, label_dict_val, _) in tqdm(enumerate(valloader)):
                        images_val = images_val.to(device)
                        output_dict = model(images_val)

                        val_loss = compute_loss(loss_dict, images_val, label_dict_val,
                                                output_dict, device, v_loader)
                        val_loss_meter.update(val_loss.item())

                        for name, metrics in running_metrics_val.items():
                            gt_array = label_dict_val[name].data.cpu().numpy()
                            if name + '_loss' in cfg['training'] and \
                                    cfg['training'][name + '_loss']['name'] == 'l1':
                                # for binary classification
                                pred_array = output_dict[name].data.cpu().numpy()
                                pred_array = np.sign(pred_array)
                                pred_array[pred_array == -1] = 0
                                gt_array[gt_array == -1] = 0
                            else:
                                pred_array = output_dict[name].data.max(1)[1].cpu().numpy()
                            metrics.update(gt_array, pred_array)

                    softmax_gt_array = None
                    # log validation images
                    pred_array = output_dict["seg"].data.max(1)[1].cpu().numpy()
                    gt_array = label_dict_val["seg"].data.cpu().numpy()
                    if "softmax" in label_dict_val:
                        softmax_gt_array = label_dict_val["softmax"].data.max(1)[1].cpu().numpy()
                    write_images_to_board(v_loader, images_val, gt_array, pred_array, i,
                                          'validation', softmax_gt=softmax_gt_array)
                    if save_dir is not None:
                        images_val = images_val.cpu().numpy().transpose(0, 2, 3, 1)
                        write_images_to_dir(v_loader, images_val, gt_array, pred_array, i,
                                            save_dir, name='validation',
                                            softmax_gt=softmax_gt_array)

                logger.info("Iter %d Val Loss: %.4f" % (i + 1, val_loss_meter.avg))
                writer.add_scalar("loss/val_loss", val_loss_meter.avg, i + 1)

                for name, metrics in running_metrics_val.items():
                    overall, classwise = metrics.get_scores()
                    for k, v in overall.items():
                        logger.info("{}_{}: {}".format(name, k, v))
                        writer.add_scalar("val_metrics/{}_{}".format(name, k), v, i + 1)
                        if k == cfg["training"]["save_metric"]:
                            curr_performance = v
                    for metric_name, metric in classwise.items():
                        for k, v in metric.items():
                            logger.info("{}_{}_{}: {}".format(name, metric_name, k, v))
                            writer.add_scalar("val_metrics/{}_{}_{}".format(name, metric_name, k),
                                              v, i + 1)
                    metrics.reset()

                state = {
                    "epoch": i + 1,
                    "model_state": model.state_dict(),
                    "optimizer_state": optimizer.state_dict(),
                    "scheduler_state": scheduler.state_dict(),
                }
                save_path = os.path.join(
                    writer.file_writer.get_logdir(),
                    "{}_{}_checkpoint.pkl".format(cfg["model"]["arch"], cfg["data"]["dataset"]),
                )
                torch.save(state, save_path)

                if curr_performance >= best_iou:
                    best_iou = curr_performance
                    state = {
                        "epoch": i + 1,
                        "model_state": model.state_dict(),
                        "best_iou": best_iou,
                    }
                    save_path = os.path.join(
                        writer.file_writer.get_logdir(),
                        "{}_{}_best_model.pkl".format(cfg["model"]["arch"], cfg["data"]["dataset"]),
                    )
                    torch.save(state, save_path)
                torch.cuda.empty_cache()

            if (i + 1) == cfg["training"]["train_iters"]:
                flag = False
                break
def train_net(net, cfg):
    log_path = cfg.OUTPUT_DIR
    summarize_config(cfg)

    optimizer = optim.Adam(net.parameters(), lr=cfg.TRAINER.LR, weight_decay=0.0005)

    weighted_criterion = False
    if cfg.MODEL.LOSS_TYPE == 'CrossEntropyLoss':
        criterion = cross_entropy_loss
    elif cfg.MODEL.LOSS_TYPE == 'SoftDiceMulticlassLoss':
        criterion = soft_dice_loss_multi_class
    elif cfg.MODEL.LOSS_TYPE == 'SoftDiceMulticlassLossDebug':
        criterion = soft_dice_loss_multi_class_debug
    elif cfg.MODEL.LOSS_TYPE == 'GeneralizedDiceLoss':
        criterion = generalized_soft_dice_loss_multi_class
    elif cfg.MODEL.LOSS_TYPE == 'JaccardLikeLoss':
        criterion = jaccard_like_loss_multi_class
    elif cfg.MODEL.LOSS_TYPE == 'ComboLoss':
        criterion = combo_loss
        weighted_criterion = cfg.TRAINER.CE_CLASS_BALANCE.ENABLED
        weights = 1 / torch.tensor(cfg.TRAINER.CE_CLASS_BALANCE.WEIGHTS)
        weights = weights.cuda()

    if cfg.MODEL.PRETRAINED.ENABLED:
        net = load_pretrained(net, cfg)
    if torch.cuda.device_count() > 1:
        print(torch.cuda.device_count(), " GPUs!")
        net = nn.DataParallel(net)
    net.to(device)

    bg_class = cfg.MODEL.BACKGROUND.TYPE
    trfm = build_transforms(
        cfg,
        for_training=True,
        use_gts_mask=cfg.DATASETS.LOCALIZATION_MASK.TRAIN_USE_GTS_MASK)
    dataset = Xview2Detectron2DamageLevelDataset(cfg.DATASETS.TRAIN[0],
                                                 pre_or_post='post',
                                                 include_image_weight=True,
                                                 background_class=bg_class,
                                                 transform=trfm)

    dataloader_kwargs = {
        'batch_size': cfg.TRAINER.BATCH_SIZE,
        'num_workers': cfg.DATALOADER.NUM_WORKER,
        'shuffle': cfg.DATALOADER.SHUFFLE,
        'drop_last': True,
    }
    # sampler
    if cfg.AUGMENTATION.IMAGE_OVERSAMPLING_TYPE == 'simple':
        image_p = image_sampling_weight(dataset.dataset_metadata)
        sampler = torch_data.WeightedRandomSampler(weights=image_p,
                                                   num_samples=len(image_p))
        dataloader_kwargs['sampler'] = sampler
        dataloader_kwargs['shuffle'] = False
    dataloader = torch_data.DataLoader(dataset, **dataloader_kwargs)

    max_epochs = cfg.TRAINER.EPOCHS
    global_step = 0
    for epoch in range(max_epochs):
        start = timeit.default_timer()
        print('Starting epoch {}/{}.'.format(epoch + 1, max_epochs))
        epoch_loss = 0

        net.train()
        loss_set, f1_set = [], []
        loss_component_set = []
        positive_pixels_set = []  # Used to evaluate image oversampling techniques
        for i, batch in enumerate(dataloader):
            x = batch['x'].to(device)
            y_gts = batch['y'].to(device)
            image_weight = batch['image_weight']

            optimizer.zero_grad()
            y_pred = net(x)

            ce_loss = 0
            dice_loss = 0
            if weighted_criterion:
                loss, (ce_loss, dice_loss) = criterion(y_pred, y_gts, weights)
            else:
                loss = criterion(y_pred, y_gts)
            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()

            loss_set.append(loss.item())
            # loss_component_set.append(loss_component.cpu().detach().numpy())
            positive_pixels_set.extend(image_weight.cpu().numpy())

            if global_step % 10000 == 0 and global_step > 0:
                check_point_name = f'cp_{global_step}.pkl'
                save_path = os.path.join(log_path, check_point_name)
                torch.save(net.state_dict(), save_path)

            if global_step % 100 == 0 and global_step > 0:
                # time per 100 steps
                stop = timeit.default_timer()
                time_per_n_batches = stop - start

                max_mem, max_cache = gpu_stats()
                print(f'step {global_step}, avg loss: {np.mean(loss_set):.4f}, '
                      f'cuda mem: {max_mem} MB, cuda cache: {max_cache} MB, '
                      f'time: {time_per_n_batches:.2f}s', flush=True)

                log_data = {
                    'loss': np.mean(loss_set),
                    'ce_component_loss': ce_loss,
                    'dice_component_loss': dice_loss,
                    'gpu_memory': max_mem,
                    'time': time_per_n_batches,
                    'total_positive_pixels': np.mean(positive_pixels_set),
                    'step': global_step,
                }
                wandb.log(log_data)

                loss_set = []
                positive_pixels_set = []
                start = stop

            global_step += 1

        # Evaluation for multiclass F1 score
        dmg_model_eval(net, cfg, device, max_samples=100, step=global_step, epoch=epoch)
        dmg_model_eval(net, cfg, device, max_samples=100, run_type='TRAIN',
                       step=global_step, epoch=epoch)
def train_net(net, cfg):
    log_path = cfg.OUTPUT_DIR
    writer = SummaryWriter(log_path)

    run_config = {}
    run_config['CONFIG_NAME'] = cfg.NAME
    run_config['device'] = device
    run_config['log_path'] = cfg.OUTPUT_DIR
    run_config['training_set'] = cfg.DATASETS.TRAIN
    run_config['test set'] = cfg.DATASETS.TEST
    run_config['epochs'] = cfg.TRAINER.EPOCHS
    run_config['learning rate'] = cfg.TRAINER.LR
    run_config['batch size'] = cfg.TRAINER.BATCH_SIZE
    table = {
        'run config name': run_config.keys(),
        ' ': run_config.values(),
    }
    print(tabulate(table, headers='keys', tablefmt="fancy_grid"))

    optimizer = optim.Adam(net.parameters(), lr=cfg.TRAINER.LR, weight_decay=0.0005)

    if cfg.MODEL.LOSS_TYPE == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss()
    elif cfg.MODEL.LOSS_TYPE == 'CrossEntropyLoss':
        balance_weight = [cfg.MODEL.NEGATIVE_WEIGHT, cfg.MODEL.POSITIVE_WEIGHT]
        balance_weight = torch.tensor(balance_weight).float().to(device)
        criterion = nn.CrossEntropyLoss(weight=balance_weight)
    elif cfg.MODEL.LOSS_TYPE == 'SoftDiceLoss':
        criterion = soft_dice_loss
    elif cfg.MODEL.LOSS_TYPE == 'SoftDiceBalancedLoss':
        criterion = soft_dice_loss_balanced
    elif cfg.MODEL.LOSS_TYPE == 'JaccardLikeLoss':
        criterion = jaccard_like_loss
    elif cfg.MODEL.LOSS_TYPE == 'ComboLoss':
        criterion = lambda pred, gts: F.binary_cross_entropy_with_logits(pred, gts) + \
            soft_dice_loss(pred, gts)
    elif cfg.MODEL.LOSS_TYPE == 'WeightedComboLoss':
        criterion = lambda pred, gts: 2 * F.binary_cross_entropy_with_logits(pred, gts) + \
            soft_dice_loss(pred, gts)
    elif cfg.MODEL.LOSS_TYPE == 'FrankensteinLoss':
        criterion = lambda pred, gts: F.binary_cross_entropy_with_logits(pred, gts) + \
            jaccard_like_balanced_loss(pred, gts)
    elif cfg.MODEL.LOSS_TYPE == 'FrankensteinEdgeLoss':
        criterion = frankenstein_edge_loss

    if torch.cuda.device_count() > 1:
        print(torch.cuda.device_count(), " GPUs!")
        net = nn.DataParallel(net)
    net.to(device)

    global_step = 0
    epochs = cfg.TRAINER.EPOCHS
    use_edge_loss = cfg.MODEL.LOSS_TYPE == 'FrankensteinEdgeLoss'

    for name, _ in net.named_parameters():
        print(name)

    trfm = []
    trfm.append(BGR2RGB())
    if cfg.DATASETS.USE_CLAHE_VARI:
        trfm.append(VARI())
    if cfg.AUGMENTATION.RESIZE:
        trfm.append(Resize(scale=cfg.AUGMENTATION.RESIZE_RATIO))
    if cfg.AUGMENTATION.CROP_TYPE == 'uniform':
        trfm.append(UniformCrop(crop_size=cfg.AUGMENTATION.CROP_SIZE))
    elif cfg.AUGMENTATION.CROP_TYPE == 'importance':
        trfm.append(ImportanceRandomCrop(crop_size=cfg.AUGMENTATION.CROP_SIZE))
    if cfg.AUGMENTATION.RANDOM_FLIP_ROTATE:
        trfm.append(RandomFlipRotate())
    trfm.append(Npy2Torch())
    trfm = transforms.Compose(trfm)

    # reset the generators
    dataset = Xview2Detectron2Dataset(
        cfg.DATASETS.TRAIN[0],
        pre_or_post=cfg.DATASETS.PRE_OR_POST,
        include_image_weight=True,
        transform=trfm,
        include_edge_mask=use_edge_loss,
        edge_mask_type=cfg.MODEL.EDGE_WEIGHTED_LOSS.TYPE,
        use_clahe=cfg.DATASETS.USE_CLAHE_VARI,
    )

    dataloader_kwargs = {
        'batch_size': cfg.TRAINER.BATCH_SIZE,
        'num_workers': cfg.DATALOADER.NUM_WORKER,
        'shuffle': cfg.DATALOADER.SHUFFLE,
        'drop_last': True,
        'pin_memory': True,
    }
    # sampler
    if cfg.AUGMENTATION.IMAGE_OVERSAMPLING_TYPE == 'simple':
        image_p = image_sampling_weight(dataset.dataset_metadata)
        sampler = torch_data.WeightedRandomSampler(weights=image_p,
                                                   num_samples=len(image_p))
        dataloader_kwargs['sampler'] = sampler
        dataloader_kwargs['shuffle'] = False
    dataloader = torch_data.DataLoader(dataset, **dataloader_kwargs)

    for epoch in range(epochs):
        start = timeit.default_timer()
        print('Starting epoch {}/{}.'.format(epoch + 1, epochs))
        epoch_loss = 0

        net.train()

        # mean AP, mean AUC, max F1
        mAP_set_train, mAUC_set_train, maxF1_train = [], [], []
        loss_set, f1_set = [], []
        positive_pixels_set = []  # Used to evaluate image oversampling techniques
        for i, batch in enumerate(dataloader):
            optimizer.zero_grad()

            x = batch['x'].to(device)
            y_gts = batch['y'].to(device)
            image_weight = batch['image_weight']

            y_pred = net(x)

            if cfg.MODEL.LOSS_TYPE == 'CrossEntropyLoss':
                # Cross entropy loss doesn't like single channel dimension
                y_gts = y_gts.long()  # Cross entropy loss requires a long as target

            if use_edge_loss:
                edge_mask = y_gts[:, [0]]
                y_gts = y_gts[:, 1:]
                edge_loss_scale = edge_loss_warmup_schedule(cfg, global_step)
                loss, ce_loss, jaccard_loss, edge_loss = criterion(y_pred, y_gts,
                                                                   edge_mask, edge_loss_scale)
                wandb.log({
                    'ce_loss': ce_loss,
                    'jaccard_loss': jaccard_loss,
                    'edge_loss': edge_loss,
                    'step': global_step,
                    'edge_loss_scale': edge_loss_scale,
                })
            else:
                loss = criterion(y_pred, y_gts)

            epoch_loss += loss.item()

            loss.backward()
            optimizer.step()

            loss_set.append(loss.item())
            positive_pixels_set.extend(image_weight.cpu().numpy())

            if global_step % 100 == 0 or global_step == 0:
                # time per 100 steps
                stop = timeit.default_timer()
                time_per_n_batches = stop - start

                if global_step % 10000 == 0 and global_step > 0:
                    check_point_name = f'cp_{global_step}.pkl'
                    save_path = os.path.join(log_path, check_point_name)
                    torch.save(net.state_dict(), save_path)

                # Averaged loss and f1 writer
                # writer.add_scalar('f1/train', np.mean(f1_set), global_step)
                max_mem, max_cache = gpu_stats()
                print(f'step {global_step}, avg loss: {np.mean(loss_set):.4f}, '
                      f'cuda mem: {max_mem} MB, cuda cache: {max_cache} MB, '
                      f'time: {time_per_n_batches:.2f}s', flush=True)

                wandb.log({
                    'loss': np.mean(loss_set),
                    'gpu_memory': max_mem,
                    'time': time_per_n_batches,
                    'total_positive_pixels': np.mean(positive_pixels_set),
                    'step': global_step,
                })

                loss_set = []
                positive_pixels_set = []
                start = stop

            global_step += 1

        if epoch % 2 == 0:
            # Evaluation after every other epoch
            model_eval(net, cfg, device, max_samples=100, step=global_step, epoch=epoch)
            model_eval(net, cfg, device, max_samples=100, run_type='TRAIN',
                       step=global_step, epoch=epoch)
def get_sampler(self):
    self.sampler = data.WeightedRandomSampler(
        torch.tensor(self.priority_weights.cpu()), self.num_samples)
    return self.sampler
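A hedged, self-contained illustration of the behavior relied on above (toy priorities, not from the source class): WeightedRandomSampler draws indices in proportion to the weights, with replacement by default, so an item with twice the priority is drawn about twice as often per pass of num_samples draws.

# toy priority weights only, for demonstration
import torch
from torch.utils import data

priority_weights = torch.tensor([1.0, 2.0, 4.0, 1.0])
sampler = data.WeightedRandomSampler(priority_weights, num_samples=8000)
counts = torch.bincount(torch.tensor(list(sampler)), minlength=len(priority_weights))
print(counts)   # roughly 1000, 2000, 4000, 1000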