Example #1
def evalAllCLF(inputdata, clf, batchsize=1, device=None):
    r"""evaluate a classifier with the whole dataset

    Args:
        inputdata (torchvision.datasets): dataset to be evaluated
        clf (callable obj): the classifier you want to test
        batchsize (int, optional): evaluation batch size; pick it to fit your
            available RAM/VRAM. Defaults to 1.
        device (torch.device, optional): device to run the evaluation on. It
            should be the device 'clf' lives on. If left undefined, the data
            is not moved, unless 'clf' is a torch.nn.Module (see the note
            below).

    Returns:
        (1D tensor, 1D tensor): prediction & ground truth

    .. note::
        If clf is a torch.nn.Module, 'device' is disregarded and the data is
        always moved to the device where clf is located.
    """
    isTrain = False
    if isinstance(clf, torch.nn.Module):  # Special treatment for NN
        isTrain = clf.training  # mark model's original state
        clf.eval()  # set net into eval mode
        device = next(clf.parameters()).device  # fetch the device the model is on

    # Build data loader
    loader = torch.utils.data.DataLoader(inputdata,
                                         batch_size=batchsize,
                                         shuffle=False,
                                         num_workers=2)
    # Build progress bar
    pb = tqdm(total=len(inputdata),
              desc=f"Evaluating NN on {device}: ",
              leave=True,
              ascii=(os.name == "nt"))
    ind = 0
    pred, gt = None, None
    for data in loader:
        inputs, labels = data[0].to(device), data[1].to(device)
        _, predicted = torch.max(clf(inputs).detach(), 1)
        if pred is None:
            # pre-allocate memory for storing results
            pred = torch.empty(len(inputdata),
                               dtype=predicted.dtype,
                               device=device)
            gt = torch.empty(len(inputdata), dtype=labels.dtype, device=device)
        pred[ind:ind + len(labels)] = predicted
        gt[ind:ind + len(labels)] = labels
        ind += len(labels)
        pb.update(len(labels))
    pb.close()

    if isTrain:
        clf.train()

    return pred, gt
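
A minimal usage sketch: `model` and `testset` are placeholders for a trained classifier and a torchvision-style test set; since the function returns prediction and ground-truth tensors, accuracy follows directly.

# Hypothetical usage; `model` and `testset` are assumed to be defined elsewhere.
pred, gt = evalAllCLF(testset, model, batchsize=64)
accuracy = (pred == gt).float().mean().item()
print(f"Accuracy: {accuracy:.2%}")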
Example #2
def gen_bar_updater():
    """Return a reporthook-style callback that updates a tqdm download bar."""
    pbar = tqdm(total=None)

    def bar_update(count, block_size, total_size):
        if pbar.total is None and total_size:
            pbar.total = total_size
        progress_bytes = count * block_size
        pbar.update(progress_bytes - pbar.n)

    return bar_update
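
The returned callback has the (count, block_size, total_size) signature that urllib expects for a reporthook, so it can drive a download progress bar; a minimal sketch (the URL and file name are placeholders):

import urllib.request

url = "https://example.com/archive.tar.gz"  # placeholder URL
urllib.request.urlretrieve(url, "archive.tar.gz", reporthook=gen_bar_updater())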
Example #3
def runParallelTqdm(func, arglist, workers=1):
    """Handle multiple tasks with tqdm bar in parallel.
       The function to be run must include keyword argument "vid",
       which should be passed to tqdm's position.

    Args:
        func (callable): The function you want to run in parallel
                        example: func(**kwarg, vid)
        arglist (dict/list of dict): arguments for specified function.
                        should be a list of keyword dictionaries.

        workers (int, optional): The number of processes run in parallel
                        At least 1, won't exceed the number of cpu cores.

    Returns:
        [list]: returns of your function in the same order of the arglist
    """
    if not isinstance(arglist, list):
        arglist = [arglist]
    workers = min(max(workers, 1), os.cpu_count())

    slotManager = Manager()
    # shared slot bookkeeping: free bar rows and rows currently in use (managed by _worker)
    opened = slotManager.list(range(workers - 1, -1, -1))
    filled = slotManager.dict()

    pb = tqdm(total=len(arglist),
              desc="Overall",
              leave=True,
              position=workers,
              ascii=(os.name == "nt"),
              unit="task",
              mininterval=0.2)

    executor = ProcessPoolExecutor(max_workers=workers)
    tasks = [
        executor.submit(_worker, func, args, opened, filled)
        for args in arglist
    ]

    for _ in as_completed(tasks):
        # Move the "Overall" bar up once fewer tasks than workers remain
        # (relies on ProcessPoolExecutor's private _pending_work_items mapping)
        if len(executor._pending_work_items) < workers:
            pb.clear()
            pb.pos = (-max(filled.values()) - 1) if filled else 0
        pb.refresh()
        pb.update(1)

    executor.shutdown(wait=True)
    pb.close()
    return [task.result() for task in tasks]
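
A minimal usage sketch, assuming the _worker helper used above (not shown here) hands out a free slot index as the "vid" keyword; `demo_task` is a hypothetical worker that draws its own bar at row `vid` while the "Overall" bar tracks completed tasks.

import time
from tqdm import tqdm

def demo_task(n, vid=0):
    # per-task bar sits at the row handed out by runParallelTqdm
    for _ in tqdm(range(n), desc=f"task on row {vid}", position=vid, leave=False):
        time.sleep(0.01)
    return n

if __name__ == "__main__":
    results = runParallelTqdm(demo_task, [{"n": 40}, {"n": 60}, {"n": 20}], workers=2)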
Example #4
def _genGaussNoise(inset, outfolder, var=0.1, vid=0):
    r"""generate a single dirty dataset contaminated by Gaussian noise
        **Internal use only**

    Args:
        inset (VDPlus): A VDPlus object, your input set
        outfolder (Path): output directory
        var (float): 0~1, scale (used as the standard deviation) of the added
            Gaussian noise
        vid (int, optional): tqdm bar row ('position') when run in parallel.
            Defaults to 0.
    """
    outfolder = Path(outfolder)
    # Create folder if not exist
    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # Init output set
    outset = VDPlus(str(outfolder), tags=inset.classes)
    outset.img_type = inset.img_type
    outset.classes_count = inset.classes_count
    outset.targets = inset.targets

    # Set progress bar
    pb = tqdm(total=len(inset),
              desc=f'Process "{outfolder.name}"',
              leave=True,
              position=vid,
              ascii=(os.name == "nt"),
              mininterval=0.3)

    # True when samples are held in RAM rather than referenced by file path
    _ram = not isinstance(inset.data[0], Path)

    # Process data
    for (_img, _), _ori in zip(inset, inset.data):
        _img = F.to_tensor(_img)
        _img.add_(torch.randn(_img.size()), alpha=var)
        _img.clamp_(0, 1)
        _img = F.to_pil_image(_img)
        if _ram:
            outset.data.append(_img)
        else:
            _img.save(outfolder / _ori)

        pb.update()
    pb.close()

    if _ram:
        # Save output set if it is on ram
        outset.makeCache(outfolder.name)
        outset.dumpMeta(outfolder.name)
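
The core noise injection can also be used on its own; a standalone sketch of the same transform applied to a single image (the file names are placeholders):

import torch
from PIL import Image
import torchvision.transforms.functional as F

img = Image.open("sample.png").convert("RGB")   # placeholder input file
t = F.to_tensor(img)
t.add_(torch.randn(t.size()), alpha=0.1)        # same injection as above; alpha = noise scale
t.clamp_(0, 1)
F.to_pil_image(t).save("sample_noisy.png")      # placeholder output file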
Example #5
    def trainNet(self,
                 dataset,
                 epoch=1,
                 batchsize=10,
                 workers=4,
                 logPE=10,
                 optimizer=None,
                 criterion=None,
                 loger=print):
        r"""Train your net with decency

        Args:
            dataset (torch dataset or dataloader): Data for training.
            epoch (int, optional): Defaults to 1.
            batchsize (int, optional): Defaults to 10.
            workers (int, optional): Number of data-loader worker processes.
                Defaults to 4.
            logPE (int, optional): Number of log entries per epoch. Defaults to 10.
            optimizer (torch.optim.Optimizer, optional): Defaults to Adam with
                its default settings.
            criterion (torch.nn loss, optional): Defaults to nn.CrossEntropyLoss().
            loger (print-like callable, optional): Custom logging function.
                Defaults to print.

        .. note::
            If a dataloader is passed, batchsize and workers are ignored.
        """
        device = next(self.parameters()).device  # fetch the device the model is on
        # Prepare dataset
        # Pinned memory only speeds up host-to-GPU copies, so skip it on a CPU device
        if device.type == "cpu":
            pind = False
        else:
            pind = bool(workers)

        if isinstance(dataset, torch.utils.data.Dataset):
            loader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=batchsize,
                                                 pin_memory=pind,
                                                 shuffle=True,
                                                 num_workers=workers)
        elif isinstance(dataset, torch.utils.data.DataLoader):
            loader = dataset
        else:
            raise ValueError("Invalid training data. Should be either \
a torch dataset or a torch dataloader")

        # Assign default optimizer
        if not optimizer:
            optimizer = Adam(self.parameters())

        # Assign default criterion
        if not criterion:
            criterion = nn.CrossEntropyLoss()

        isTrain = self.training  # mark model's original state
        self.train()  # set model into training mode

        # Log info
        brief = "\n".join(f"{k}: {v}" for k, v in optimizer.defaults.items())
        logstr = f"""Start training...
==========Training Brief===========
Epoch: {epoch}
Batch size: {loader.batch_size}
Num of loaders: {workers}
Loss func: {type(criterion).__name__}
Optimizer: {type(optimizer).__name__}
==========Optimizer Brief==========
{brief}
===========Dataset Brief===========
{loader.dataset}
=========Model definition=========
{self.printFUN()}
=================================="""
        if isinstance(loger, printLog):
            loger(logstr, t=True)
        else:
            loger(logstr)

        # Start training
        # Build progress bar
        pb = tqdm(total=epoch * len(loader.dataset),
                  desc=f"Training NN on {device}: ",
                  leave=True,
                  ascii=(os.name == "nt"),
                  mininterval=0.3)
        pwrite = bool(logPE)
        # convert "logs per epoch" into the number of batches between log updates
        logPE = int(max(len(loader) / (logPE if pwrite else 100), 1))
        for ep in range(epoch):
            running_loss = 0.0
            for i, data in enumerate(loader, 0):
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = data[0].to(device), data[1].to(device)
                # zero the parameter gradients
                self.zero_grad()

                # forward + backward + optimize
                outputs = self(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()
                if (logPE == 1) or (i % logPE == logPE - 1):
                    if pwrite:
                        logstr = f"\
Ep {ep+1}/{epoch} - trained {(i+1)*loader.batch_size}:  \
loss_{round(running_loss / logPE, 3)}"

                        if isinstance(loger, printLog):
                            loger(logstr, redirect=True, t=True)
                        pb.write(logstr)
                    else:
                        pb.set_postfix_str(f"Loss: {running_loss/logPE:.3f}")
                    running_loss = 0.0
                pb.update(len(labels))
        pb.close()
        logstr = pb.format_interval(pb.format_dict["elapsed"])
        if isinstance(loger, printLog):
            loger(logstr, t=True)
        else:
            loger(logstr)

        # reset model to its original state
        if not isTrain:
            self.eval()
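
A minimal usage sketch: `Net` stands for the class that defines trainNet (and printFUN) and `trainset` for a torch Dataset of (image, label) pairs; both names are placeholders.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net().to(device)                              # hypothetical subclass providing trainNet
net.trainNet(trainset, epoch=5, batchsize=32, workers=2, logPE=4)
pred, gt = evalAllCLF(trainset, net, batchsize=64)  # evaluator from Example #1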
Example #6
        optimizer.load_state_dict(checkpoint["optimizer"])

    # scheduler = lr_scheduler.ReduceLROnPlateau(
    #     optimizer, mode="min", patience=3, verbose=True, factor=0.2)
    # train and print AUC score for all epochs

    if is_model_trained:
        start_epoch = checkpoint["epoch"]
        end_epoch = checkpoint["epoch"] + epochs
    else:
        start_epoch = 0
        end_epoch = epochs

    train_losses = []
    val_losses = []
    for epoch in tqdm(range(start_epoch, end_epoch)):
        print("epoch " + str(epoch))
        train_loss = engine.train(train_loader,
                                  model,
                                  optimizer,
                                  device=device)
        val_loss = engine.evaluate(valid_loader, model, device=device)
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        writer.add_scalar("train", train_loss, epoch)
        writer.add_scalar("val", val_loss, epoch)
        writer.add_scalars("train and val losses", {
            "train": train_loss,
            "val": val_loss
        }, epoch)
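
For the resume logic above to work, the checkpoint must have been saved with matching keys; a sketch of the save side (only "epoch" and "optimizer" appear in the snippet, so the "model" key and the file name are assumptions):

torch.save({
    "epoch": epoch,                       # read back as checkpoint["epoch"]
    "model": model.state_dict(),          # assumed key for the model weights
    "optimizer": optimizer.state_dict(),  # read back as checkpoint["optimizer"]
}, "checkpoint.pth")                      # placeholder path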
Example #7
def _genWrongLabel(inset, outfolder, ratio, vid=0):
    r"""generate a single dirty dataset contaminated by wrong labels
        **Internal use only**

    Args:
        inset (VDPlus): A VDPlus object whose OrderedDict attribute
                        "classified" has been populated, e.g. via
                        "inset.inspectSet()"
        outfolder (Path): output directory
        ratio (float): 0~1, the fraction of samples whose label is flipped
        vid (int, optional): tqdm bar row ('position') when run in parallel.
            Defaults to 0.
    """
    outfolder = Path(outfolder)
    # Create folder if not exist
    if not outfolder.exists():
        outfolder.mkdir(parents=True)

    # Init output set
    outset = VDPlus(str(outfolder), tags=inset.classes)
    outset.img_type = inset.img_type
    outset.classes_count = dict.fromkeys(outset.classes, 0)

    # Set progress bar
    pb = tqdm(total=len(inset),
              desc=f'Process "{outfolder.name}"',
              leave=True,
              position=vid,
              ascii=(os.name == "nt"),
              mininterval=0.3)

    # Process data, iter through each class
    for target, (tag, samples) in enumerate(inset.classified.items()):
        # Randomize samples
        samples = samples.copy()
        shuffle(samples)

        # Cal the number for changed and unchanged
        _changed = round(len(samples) * ratio)
        _unchanged = len(samples) - _changed

        # Attach all shuffled samples; the first _unchanged keep the true label
        outset.data.extend(samples)
        outset.targets.extend([target] * _unchanged)
        outset.classes_count[tag] += _unchanged
        pb.update(_unchanged)

        # Relabel the remaining _changed samples with a random wrong class
        w_label = [c for c in range(len(outset.classes)) if c != target]

        _n_targets = choices(w_label, k=_changed)
        outset.targets.extend(_n_targets)
        for item in _n_targets:
            outset.classes_count[outset.classes[item]] += 1
            pb.update()
    pb.close()

    if isinstance(inset.data[0], Path):
        # Save output set if it's not on ram
        outset.root = inset.root
        setUnpack(outset, outfolder=outfolder, vid=vid)
    else:
        # Save output set if it is on ram
        outset.root = outfolder
        outset.makeCache(outfolder.name)
        outset.dumpMeta(outfolder.name)
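
The label-flipping step itself is independent of VDPlus; a standalone sketch with toy values shows the idea: shuffle one class's samples, keep the first part labelled correctly, and relabel the rest with a random other class.

from random import choices, shuffle

classes = list(range(10))          # toy class indices
target, ratio = 3, 0.2             # class being contaminated and contamination ratio
samples = list(range(100))         # toy sample ids for that class

shuffle(samples)
n_changed = round(len(samples) * ratio)
wrong = [c for c in classes if c != target]
labels = [target] * (len(samples) - n_changed) + choices(wrong, k=n_changed)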