def evalAllCLF(inputdata, clf, batchsize=1, device=None):
    r"""Evaluate a classifier on a whole dataset.

    Args:
        inputdata (torchvision.datasets): dataset to be evaluated
        clf (callable obj): the classifier you want to test
        batchsize (int, optional): choose according to your available RAM.
            Defaults to 1.
        device (torch.device, optional): device to test on. This should be
            the same device 'clf' lives on. If left undefined, it will try
            to match 'clf' or the current default device.

    Returns:
        (1D tensor, 1D tensor): prediction & ground truth

    .. note::
        If clf is a torch.nn.Module, 'device' will be disregarded
        and data will always be passed to wherever clf is located.
    """
    isTrain = False
    if isinstance(clf, torch.nn.Module):  # Special treatment for NN
        isTrain = clf.training  # mark model's original state
        clf.eval()  # set net into eval mode
        device = next(clf.parameters()).device  # fetch where the model is
    # Build data loader
    loader = torch.utils.data.DataLoader(inputdata,
                                         batch_size=batchsize,
                                         shuffle=False,
                                         num_workers=2)
    # Build progress bar
    pb = tqdm(total=len(inputdata),
              desc=f"Evaluating NN on {device}: ",
              leave=True,
              ascii=(os.name == "nt"))
    ind = 0
    pred, gt = None, None
    for data in loader:
        inputs, labels = data[0].to(device), data[1].to(device)
        _, predicted = torch.max(clf(inputs).detach(), 1)
        if pred is None:
            # pre-allocate memory for storing results
            pred = torch.empty(len(inputdata),
                               dtype=predicted.dtype,
                               device=device)
            gt = torch.empty(len(inputdata),
                             dtype=labels.dtype,
                             device=device)
        pred[ind:ind + len(labels)] = predicted
        gt[ind:ind + len(labels)] = labels
        ind += len(labels)
        pb.update(len(labels))
    pb.close()
    if isTrain:
        clf.train()  # restore model's original state
    return pred, gt
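# Usage sketch (hypothetical names): compute overall accuracy from the two
# returned tensors. `net` and `testset` are placeholders for a trained
# torch.nn.Module and a torchvision-style dataset, not part of this module.
#
#     pred, gt = evalAllCLF(testset, net, batchsize=64)
#     accuracy = (pred == gt).float().mean().item()
#     print(f"Accuracy: {accuracy:.2%}")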
def gen_bar_updater():
    """Return a reporthook-style callback that drives a tqdm bar."""
    pbar = tqdm(total=None)

    def bar_update(count, block_size, total_size):
        # Learn the total size from the first callback that provides it
        if pbar.total is None and total_size:
            pbar.total = total_size
        progress_bytes = count * block_size
        pbar.update(progress_bytes - pbar.n)

    return bar_update
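# Usage sketch: the (count, block_size, total_size) signature matches the
# `reporthook` parameter of urllib.request.urlretrieve, so the updater can
# track a download. The URL and filename below are placeholders.
#
#     import urllib.request
#     urllib.request.urlretrieve("https://example.com/data.tar.gz",
#                                "data.tar.gz",
#                                reporthook=gen_bar_updater())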
def runParallelTqdm(func, arglist, workers=1):
    """Handle multiple tasks with tqdm bars in parallel.

    The function to be run must accept a keyword argument "vid",
    which should be passed on to tqdm's `position`.

    Args:
        func (callable): The function you want to run in parallel.
            Example signature: func(vid=0, **kwargs)
        arglist (dict/list of dict): arguments for the specified function.
            Should be a list of keyword dictionaries.
        workers (int, optional): The number of processes run in parallel.
            At least 1, won't exceed the number of cpu cores.

    Returns:
        [list]: returns of your function in the same order as the arglist
    """
    if not isinstance(arglist, list):
        arglist = [arglist]
    workers = min(max(workers, 1), os.cpu_count())
    slotManager = Manager()
    opened = slotManager.list(range(workers - 1, -1, -1))
    filled = slotManager.dict()
    pb = tqdm(total=len(arglist),
              desc="Overall",
              leave=True,
              position=workers,
              ascii=(os.name == "nt"),
              unit="task",
              mininterval=0.2)
    executor = ProcessPoolExecutor(max_workers=workers)
    tasks = [
        executor.submit(_worker, func, args, opened, filled)
        for args in arglist
    ]
    for _ in as_completed(tasks):
        # Adjust the Overall bar position as worker slots free up.
        # Note: _pending_work_items is a private attribute of
        # ProcessPoolExecutor and may change between Python versions.
        if len(executor._pending_work_items) < workers:
            pb.clear()
            pb.pos = (-max(filled.values()) - 1) if filled else 0
            pb.refresh()
        pb.update(1)
    executor.shutdown(wait=True)
    pb.close()
    return [task.result() for task in tasks]
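# Usage sketch (hypothetical task): each worker gets its own bar row via the
# injected `vid`. This assumes `_worker` (defined elsewhere in this module)
# pops a free slot index and passes it to func as vid=<slot>.
#
#     def _count(n, vid=0):
#         for _ in tqdm(range(n), desc=f"task@{vid}", position=vid,
#                       leave=False):
#             pass
#         return n
#
#     if __name__ == "__main__":  # guard needed for process-based executors
#         results = runParallelTqdm(_count, [{"n": 100}, {"n": 200}],
#                                   workers=2)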
def _genGaussNoise(inset, outfolder, var=0.1, vid=0):
    r"""Generate a single dirty dataset contaminated by Gaussian noise.

    **Internal use only**

    Args:
        inset (VDPlus): A VDPlus object, your input set
        outfolder (Path): output directory
        var (float): 0~1, noise scale; standard normal noise is multiplied
            by this factor before being added, so it acts as the standard
            deviation rather than the variance
        vid (int): row index for this worker's tqdm bar
    """
    outfolder = Path(outfolder)
    # Create folder if it does not exist
    if not outfolder.exists():
        outfolder.mkdir(parents=True)
    # Init output set
    outset = VDPlus(str(outfolder), tags=inset.classes)
    outset.img_type = inset.img_type
    outset.classes_count = inset.classes_count
    outset.targets = inset.targets
    # Set progress bar
    pb = tqdm(total=len(inset),
              desc=f'Process "{outfolder.name}"',
              leave=True,
              position=vid,
              ascii=(os.name == "nt"),
              mininterval=0.3)
    _ram = not isinstance(inset.data[0], Path)
    # Process data
    for (_img, _), _ori in zip(inset, inset.data):
        _img = F.to_tensor(_img)
        _img.add_(torch.randn(_img.size()), alpha=var)
        _img.clamp_(0, 1)
        _img = F.to_pil_image(_img)
        if _ram:
            outset.data.append(_img)
        else:
            _img.save(outfolder / _ori)
        pb.update()
    pb.close()
    if _ram:
        # Cache image data only when the set lives in RAM
        outset.makeCache(outfolder.name)
    # Metadata is dumped in either case
    outset.dumpMeta(outfolder.name)
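# Usage sketch (hypothetical names): a single direct call; in practice these
# calls are expected to be fanned out via runParallelTqdm, which supplies
# `vid`. `cleanset` is a placeholder VDPlus instance.
#
#     _genGaussNoise(cleanset, Path("./dirty/gauss_0.1"), var=0.1, vid=0)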
def trainNet(self, dataset, epoch=1, batchsize=10, workers=4, logPE=10,
             optimizer=None, criterion=None, loger=print):
    r"""Train your net with decency.

    Args:
        dataset (torch dataset or dataloader): Data for training.
        epoch (int, optional): Defaults to 1.
        batchsize (int, optional): Defaults to 10.
        workers (int, optional): Multiprocess loader workers. Defaults to 4.
        logPE (int, optional): Number of logs per epoch. Defaults to 10.
        optimizer (torch.optim.Optimizer): Defaults to default Adam.
        criterion (torch.nn loss module): Defaults to nn.CrossEntropyLoss().
        loger (print-like function): Could be a custom print function.

    .. note::
        If a dataloader is passed in, batchsize and workers will be ignored.
    """
    device = next(self.parameters()).device  # fetch where the model is
    # Prepare dataset
    if device == torch.device("cpu"):
        pind = True
    else:
        pind = bool(workers)
    if isinstance(dataset, torch.utils.data.Dataset):
        loader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batchsize,
                                             pin_memory=pind,
                                             shuffle=True,
                                             num_workers=workers)
    elif isinstance(dataset, torch.utils.data.DataLoader):
        loader = dataset
    else:
        raise ValueError("Invalid training data. Should be either "
                         "a torch dataset or a torch dataloader")
    # Assign default optimizer
    if not optimizer:
        optimizer = Adam(self.parameters())
    # Assign default criterion
    if not criterion:
        criterion = nn.CrossEntropyLoss()
    isTrain = self.training  # mark model's original state
    self.train()  # set model into training mode
    # Log info
    brief = "\n".join(f"{k}: {v}" for k, v in optimizer.defaults.items())
    logstr = f"""Start training...
==========Training Brief===========
Epoch: {epoch}
Batch size: {loader.batch_size}
Num of loaders: {loader.num_workers}
Loss func: {type(criterion).__name__}
Optimizer: {type(optimizer).__name__}
==========Optimizer Brief==========
{brief}
===========Dataset Brief===========
{loader.dataset}
=========Model definition=========
{self.printFUN()}
=================================="""
    if isinstance(loger, printLog):
        loger(logstr, t=True)
    else:
        loger(logstr)
    # Start training
    # Build progress bar; count samples, not batches, so the total is
    # correct even when a dataloader was passed in
    pb = tqdm(total=epoch * len(loader.dataset),
              desc=f"Training NN on {device}: ",
              leave=True,
              ascii=(os.name == "nt"),
              mininterval=0.3)
    pwrite = bool(logPE)
    logPE = int(max(len(loader) / (logPE if pwrite else 100), 1))
    for ep in range(epoch):
        running_loss = 0.0
        for i, data in enumerate(loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)
            # zero the parameter gradients
            self.zero_grad()
            # forward + backward + optimize
            outputs = self(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if (logPE == 1) or (i % logPE == logPE - 1):
                if pwrite:
                    logstr = (f"Ep {ep + 1}/{epoch} - trained "
                              f"{(i + 1) * loader.batch_size}: "
                              f"loss_{round(running_loss / logPE, 3)}")
                    if isinstance(loger, printLog):
                        loger(logstr, redirect=True, t=True)
                    pb.write(logstr)
                else:
                    pb.set_postfix_str(f"Loss: {running_loss / logPE:.3f}")
                running_loss = 0.0
            pb.update(len(labels))
    pb.close()
    logstr = pb.format_interval(pb.format_dict["elapsed"])
    if isinstance(loger, printLog):
        loger(logstr, t=True)
    else:
        loger(logstr)
    # reset model to its original state
    if not isTrain:
        self.eval()
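# Usage sketch (hypothetical names): `Net` stands for the class this method
# belongs to, and `trainset` for a labelled torch dataset.
#
#     net = Net().to("cuda" if torch.cuda.is_available() else "cpu")
#     net.trainNet(trainset, epoch=5, batchsize=32,
#                  optimizer=Adam(net.parameters(), lr=1e-3))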
optimizer.load_state_dict(checkpoint["optimizer"])
# scheduler = lr_scheduler.ReduceLROnPlateau(
#     optimizer, mode="min", patience=3, verbose=True, factor=0.2)

# train and log train/val losses for all epochs
if is_model_trained:
    start_epoch = checkpoint["epoch"]
    end_epoch = checkpoint["epoch"] + epochs
else:
    start_epoch = 0
    end_epoch = epochs

train_losses = []
val_losses = []
for epoch in tqdm(range(start_epoch, end_epoch)):
    print("epoch " + str(epoch))
    train_loss = engine.train(train_loader, model, optimizer, device=device)
    val_loss = engine.evaluate(valid_loader, model, device=device)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    writer.add_scalar("train", train_loss, epoch)
    writer.add_scalar("val", val_loss, epoch)
    writer.add_scalars("train and val losses", {
        "train": train_loss,
        "val": val_loss
    }, epoch)
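# Sketch of the checkpoint layout this resume logic assumes: a dict saved
# with torch.save holding at least the optimizer state and the last epoch.
# The "model" key and the path are assumptions for illustration only.
#
#     torch.save({
#         "epoch": end_epoch,
#         "model": model.state_dict(),          # assumed, not shown above
#         "optimizer": optimizer.state_dict(),
#     }, "checkpoint.pt")
#     # Later: checkpoint = torch.load("checkpoint.pt")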
def _genWrongLabel(inset, outfolder, ratio, vid=0):
    r"""Generate a single dirty dataset contaminated by wrong labels.

    **Internal use only**

    Args:
        inset (VDPlus): A VDPlus with an OrderedDict attribute "classified".
            inset can be generated by "inset.inspectSet()"
        outfolder (Path): output directory
        ratio (float): 0~1, the ratio of the contamination
        vid (int): row index for this worker's tqdm bar
    """
    outfolder = Path(outfolder)
    # Create folder if it does not exist
    if not outfolder.exists():
        outfolder.mkdir(parents=True)
    # Init output set
    outset = VDPlus(str(outfolder), tags=inset.classes)
    outset.img_type = inset.img_type
    outset.classes_count = dict.fromkeys(outset.classes, 0)
    # Set progress bar
    pb = tqdm(total=len(inset),
              desc=f'Process "{outfolder.name}"',
              leave=True,
              position=vid,
              ascii=(os.name == "nt"),
              mininterval=0.3)
    # Process data, iterate through each class
    for target, (tag, samples) in enumerate(inset.classified.items()):
        # Randomize samples
        samples = samples.copy()
        shuffle(samples)
        # Calculate the number of changed and unchanged samples
        _changed = round(len(samples) * ratio)
        _unchanged = len(samples) - _changed
        # Attach all (shuffled) samples; only the first _unchanged keep
        # their true label
        outset.data.extend(samples)
        outset.targets.extend([target] * _unchanged)
        outset.classes_count[tag] += _unchanged
        pb.update(_unchanged)
        # Attach changed part
        w_label = []  # Generate false candidates
        for item in range(len(outset.classes)):
            if item != target:
                w_label.append(item)
        _n_targets = choices(w_label, k=_changed)
        outset.targets.extend(_n_targets)
        for item in _n_targets:
            outset.classes_count[outset.classes[item]] += 1
            pb.update()
    pb.close()
    if isinstance(inset.data[0], Path):
        # Save output set if it's not in RAM
        outset.root = inset.root
        setUnpack(outset, outfolder=outfolder, vid=vid)
    else:
        # Save output set if it is in RAM
        outset.root = outfolder
        outset.makeCache(outfolder.name)
        outset.dumpMeta(outfolder.name)
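# Usage sketch (hypothetical paths): sweep several contamination ratios in
# parallel; `vid` is injected by runParallelTqdm, so it is omitted here.
# `inspected` is a placeholder VDPlus that already carries the "classified"
# OrderedDict (e.g. from inset.inspectSet()).
#
#     jobs = [{"inset": inspected,
#              "outfolder": f"./dirty/wrong_{r}",
#              "ratio": r} for r in (0.1, 0.2, 0.4)]
#     runParallelTqdm(_genWrongLabel, jobs, workers=3)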