Code example #1
0
    def wrapper(*args, **kwargs):
        """Run pre-flight checks, initialize logging, call the wrapped
        function, and always shut the logger down afterwards.

        Returns whatever the wrapped function returns.
        """
        from src.utils.check import check_all

        check_all()

        import src.utils.logger as log

        log.init()

        try:
            # Propagate the wrapped function's return value (the original
            # discarded it).
            return func(*args, **kwargs)
        finally:
            # Bug fix: shut the logger down even when func raises; previously
            # an exception left the logger open.
            log.shutdown()
Code example #2
0
 def setUp(self) -> None:
     """Initialize the project logger before each test case runs."""
     logger.init()
Code example #3
0
File: train_model.py — Project: YaNgZhAnG-V5/RoarTorch
def train_and_evaluate_model(arguments: dict):
    """
    Main Pipeline for training and cross-validation.

    Trains the configured model for ``nb_epochs`` epochs, periodically
    validates it, keeps only the single best checkpoint (by validation
    accuracy) on disk, symlinks it to ``best_model.pth`` in the output
    directory, then evaluates that checkpoint on the test split.

    :param arguments: configuration dict; keys used here include 'outdir',
        'random_seed', 'cuda_device', 'dataset_args', 'train_data_args',
        'val_data_args', 'model_args', 'optimizer_args', 'scheduler_args',
        'loss_args' and 'nb_epochs'.
    :return: ``(test_loss, test_accuracy)`` of the best validation model.
    """
    """ Setup result directory and enable logging to file in it """
    logger.init(arguments['outdir'],
                filename_prefix='train_cls',
                log_level=logging.INFO)  # keep logs at root dir.
    outdir = os.path.join(arguments['outdir'], 'train_cls')
    os.makedirs(outdir, exist_ok=True)
    logger.info('Arguments:\n{}'.format(pformat(arguments)))
    """ Set random seed throughout python"""
    utils.set_random_seed(random_seed=arguments['random_seed'])
    """ Create tensorboard writer """
    tb_writer = initialize_tensorboard(outdir)
    """ Set device - cpu or gpu """
    device = torch.device(
        arguments['cuda_device'] if torch.cuda.is_available() else "cpu")
    logger.info(f'Using device - {device}')
    """ Load parameters for the Dataset """
    dataset = create_dataset(arguments['dataset_args'],
                             arguments['train_data_args'],
                             arguments['val_data_args'])
    """ Load Model with weights(if available) """
    model: torch.nn.Module = models_utils.get_model(
        arguments.get('model_args'), device,
        arguments['dataset_args']).to(device)
    """ Create optimizer and scheduler """
    optimizer = optimizer_utils.create_optimizer(model.parameters(),
                                                 arguments['optimizer_args'])
    lr_scheduler: _LRScheduler = optimizer_utils.create_scheduler(
        optimizer, arguments['scheduler_args'])
    """ Create loss function """
    logger.info(f"Loss weights {dataset.pos_neg_balance_weights()}")
    criterion = loss_utils.create_loss(arguments['loss_args'])
    """ Sample and View the inputs to model """
    dataset.debug()
    """ Pipeline - loop over the dataset multiple times """
    max_validation_acc, best_validation_model_path = 0, None
    batch_index = 0
    # In debug mode run a single epoch so the full pipeline can be smoke-tested.
    nb_epochs = 1 if is_debug_mode() else arguments['nb_epochs']
    for epoch in range(nb_epochs):
        """ Train the model """
        logger.info(f"Training, Epoch {epoch + 1}/{nb_epochs}")
        train_dataloader = dataset.train_dataloader
        model.train()
        start = time.time()
        total, correct = 0, 0
        epoch_loss = 0
        for i, data in enumerate(tqdm(train_dataloader)):
            # get the inputs
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # Forward Pass
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()

            tb_writer.save_scalar('batch_training_loss', loss.item(),
                                  batch_index)
            batch_index += 1
            # Weight the batch loss by batch size so the epoch average below
            # is a true per-sample mean even with a ragged last batch.
            epoch_loss += loss.item() * labels.size(0)
            total += labels.size(0)

            # Predicted class = argmax over the class dimension.
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()

            optimizer.step()

        epoch_loss = epoch_loss / total
        logger.info(f"Epoch = {epoch}, Train_loss = {epoch_loss}, "
                    f"Time taken = {time.time() - start} seconds.")

        logger.info(f"Train_accuracy = {100 * correct / total}")
        tb_writer.save_scalar('training_loss', epoch_loss, epoch)
        tb_writer.save_scalar('training_acc', 100 * correct / total, epoch)
        """ Validate the model """
        # Validation runs only every `validate_step_size` epochs; a value
        # <= 0 disables validation entirely.
        val_data_args = arguments['val_data_args']
        if val_data_args['validate_step_size'] > 0 and \
                epoch % val_data_args['validate_step_size'] == 0:

            model.eval()
            validation_dataloader = dataset.validation_dataloader
            logger.info(
                f"Validation, Epoch {epoch + 1}/{arguments['nb_epochs']}")

            val_loss, val_accuracy = evaluate_single_class(
                device, model, validation_dataloader, criterion)
            # NOTE(review): message labels the value 'val_auc' but it holds
            # val_accuracy; also '%%' inside an f-string prints two literal
            # percent signs — confirm intent.
            logger.info(f'validation images: {dataset.val_dataset_size}, '
                        f'val_auc : {val_accuracy} %% '
                        f'val_loss: {val_loss}')
            tb_writer.save_scalar('validation_acc', val_accuracy, epoch)
            tb_writer.save_scalar('validation_loss', val_loss, epoch)
            """ Save Model """
            if val_accuracy > max_validation_acc:
                max_validation_acc = val_accuracy
                # Keep only the single best checkpoint on disk.
                if best_validation_model_path is not None:
                    os.remove(best_validation_model_path)
                best_validation_model_path = os.path.join(
                    outdir,
                    f'epoch_{epoch:04}-model-val_acc_{val_accuracy}.pth')
                torch.save(model.state_dict(), best_validation_model_path)
                logger.info(f'Model saved at: {best_validation_model_path}')

        if lr_scheduler:
            prev_lr = lr_scheduler.get_last_lr()
            lr_scheduler.step()
            if lr_scheduler.get_last_lr() != prev_lr:
                logger.warn(
                    f'Updated LR from {prev_lr} to {lr_scheduler.get_last_lr()}'
                )

    logger.info('Finished Training')
    logger.info(f'Max Validation accuracy is {max_validation_acc}')
    """ Create a symbolic link to the best model at a static path 'best_model.pth' """
    symlink_path = os.path.join(outdir, 'best_model.pth')
    if os.path.islink(symlink_path):
        os.unlink(symlink_path)
    # Link target is just the filename so the symlink stays valid relative to
    # outdir. NOTE(review): best_validation_model_path is still None if
    # validation never ran (validate_step_size <= 0) — this would raise here;
    # confirm upstream config guarantees validation happens.
    os.symlink(best_validation_model_path.rsplit('/')[-1], symlink_path)
    logger.info(
        f'Best Model saved at: {best_validation_model_path}. and symlink to {symlink_path}'
    )
    """ Evaluate model on test set """
    # strict=False tolerates missing/unexpected keys when restoring weights.
    model.load_state_dict(torch.load(best_validation_model_path), strict=False)
    test_dataloader = dataset.test_dataloader
    test_loss, test_accuracy = evaluate_single_class(device, model,
                                                     test_dataloader,
                                                     criterion)
    logger.info(
        f'Accuracy of the network on the {dataset.test_dataset_size} test images: {test_accuracy} %%'
    )
    return test_loss, test_accuracy
Code example #4
0
def dump_saliency_data():
    """
    Main Pipeline for training and cross-validation.
    """

    parser = argparse.ArgumentParser(description="config")
    parser.add_argument("--config", type=str, default="config/roar_cifar10_resnet8.yml",
                        help="Configuration file to use.")
    args = parser.parse_args()
    with open(args.config) as fp:
        cfg = yaml.load(fp, Loader=Loader)

    roar_core.validate_configuration(cfg, validate_attribution_methods=True)

    # Common Configuration
    dataset_args = cfg['data']

    train_data_args = dict(
        batch_size=4,
        shuffle=False,
        enable_augmentation=False,
    )

    assert not train_data_args['enable_augmentation'], \
        'Augmentation of dataset should be disabled for generating dataset'

    val_data_args = dict(
        batch_size=4,
        shuffle=False,
    )

    # Shuffling should be off
    assert not val_data_args['shuffle']

    arguments = dict(
        dataset_args=dataset_args,
        train_data_args=train_data_args,
        val_data_args=val_data_args,
        model_args=cfg['extract_cams']['model'],
        outdir=cfg['outdir'],

    )

    """ Setup result directory """
    outdir = os.path.join(arguments.get("outdir"), 'extract_cams')
    logger.init(outdir, filename_prefix='extract_cams', log_level=logging.INFO)  # keep logs at root dir.
    logger.info('Arguments:\n{}'.format(pformat(arguments)))

    """ Set random seed throughout python"""
    utils.set_random_seed(random_seed=random.randint(0, 1000))

    """ Set device - cpu or gpu """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger.info(f'Using device - {device}')

    """ Load parameters for the Dataset """
    dataset = create_dataset(dataset_args, train_data_args, val_data_args)

    """ Sample and View the inputs to model """
    dataset.debug()

    dataloaders = [dataset.train_dataloader, dataset.validation_dataloader, dataset.test_dataloader]
    dataset_modes = ['train', 'validation', 'test']

    # Some datasets do not have same image size and thus we need to crop a certain area to compute attribution maps
    # Since we want to use same input image for retraining of model with different attribution maps, care must be taken
    # that transform applied gives same input image. Such as center crop is fine, but RandomCrop/RandomScale are not.
    save_input_images = True
    save_attribution = True

    assert save_input_images or save_attribution, 'Either save input images or save attribution flag should be enabled.'
    attribution_methods = cfg['extract_cams']['attribution_methods']
    logger.info(f'Computing attribution maps for {attribution_methods}')

    # Save attribution maps in ./outdir/[train, validation, test]/[input/AttributionName]/Class/ImageIndex.png
    for attribution_method in attribution_methods:
        for dataloader, dataset_mode in zip(dataloaders, dataset_modes):

            """ Load Model with weights(if available) """
            model: torch.nn.Module = model_utils.get_model(
                arguments.get('model_args'), device, arguments['dataset_args']
            ).to(device)

            """ Create cropped dataset and attributions directories """
            # Need to save cropped dataset once - Although its is not optimum to have another copy of a dataset, we
            # still preferred this due to simpler design of having parallel attribution and images dataset.
            if save_input_images:
                # The Classification Dataset(CIFAR/Birdsnap) are written in torchvision.datasets.ImageFolder format.
                images_output_dirs = [os.path.join(outdir, f'{dataset_mode}/input/', str(cls))
                                      for cls in dataset.classes]
                [os.makedirs(dir, exist_ok=True) for dir in images_output_dirs]

            counter = 0
            # Create labelled attribution folder
            attribution_output_dirs = [os.path.join(outdir, f'{dataset_mode}/{attribution_method["name"]}', str(cls))
                                       for cls in dataset.classes]
            [os.makedirs(dir, exist_ok=True) for dir in attribution_output_dirs]
            counters = defaultdict(int)

            """ Thank god, finally let CAM extraction begin """
            logger.info(f"Generating images and attribution for {attribution_method} in {dataset_mode} split.")
            model.eval()
            for i, data in enumerate(tqdm(dataloader)):
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                _, max_prob_indices = torch.max(outputs.data, 1)

                # ToDo: Add support for method that can compute attributions for batch
                # TODo - Check width and height
                for preprocessed_image, max_prob_index, label in zip(inputs, max_prob_indices, labels):
                    """ Save CAMS and input images."""
                    if save_input_images:
                        # Denormalize the image and save
                        rgb_image = dataset.denormalization_transform(preprocessed_image.cpu())
                        rgb_image = (torch.clamp(rgb_image, 0.0, 1.0).numpy() * 255.0).astype('uint8')
                        skimage.io.imsave(f'{images_output_dirs[label]}/'
                                          f'{str(counters[label.item()]).zfill(5)}.png',
                                          rgb_image.transpose(1, 2, 0),
                                          check_contrast=False)

                    if save_attribution:
                        attribution_map = attribution_loader.generate_attribution(
                            model,
                            preprocessed_image.unsqueeze(0),
                            max_prob_index,
                            attribution_method
                        )

                        # Save in attribution_output_dir as a uint8 image
                        percentiled_image = convert_float_to_percentiled_3channel_image(attribution_map)
                        skimage.io.imsave(f'{attribution_output_dirs[label]}/'
                                          f'{str(counters[label.item()]).zfill(5)}.png',
                                          percentiled_image.transpose(1, 2, 0),
                                          check_contrast=False)
                    counters[label.item()] += 1

        save_input_images = False  # No need to resave input images for next attribution method
Code example #5
0
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import jsonlines
from pathlib import Path
import gzip
import src.utils.logger as logger
import re
import pandas as pd

# Module-level handle from the project logger's init (logs to the given file).
log = logger.init("./log/create_corpus.log")
# Directory holding the Semantic Scholar corpus files to be indexed.
data_path = "./semanticscholar"

# Resume support: re-read our own log and extract column 9 of each
# space-separated line, then strip the 'semanticscholar/' prefix to get the
# bare names of already-processed files.
# NOTE(review): assumes the log line format puts the file path in field 9 and
# that the log file already exists on first run — confirm.
processed = pd.read_csv("./log/create_corpus.log", sep=' ', header=None)[9]
processed = [re.sub('semanticscholar/', '', obj) for obj in processed]


def doc_generator(reader):
    for doc in reader.iter(type=dict, skip_invalid=True):
        author_names = []
        author_ids = []
        for obj in doc.get('authors'):
            author_ids.extend(obj.get('ids'))
            author_names.append(obj.get('name'))

        yield {
            "_index": 'semanticscholar',
            "_type": "document",
            "_id": doc.get('id'),
            "title": doc.get('title'),
            "paperAbstract": doc.get("paperAbstract"),
            "entities": doc.get("entities"),