コード例 #1
0
def main():
    """Train a network and then evaluate it on the test split.

    Options come from ``config.arguments()``. The network named by
    ``args.model_name`` is loaded, the torch RNGs are seeded with
    ``args.random_seed``, one dataloader is built per split, and finally
    ``train.train`` and ``eval.eval`` are run in that order.
    """
    args = config.arguments()
    logger = log_utils.LogManager(args)
    net = model_utils.load(args.model_name)

    # Select the GPU before anything is seeded or moved to it.
    if args.cuda:
        # NOTE(review): the attribute is spelled `cuda_devide`; presumably
        # this mirrors the name declared in `config` -- confirm there.
        torch.cuda.set_device(args.cuda_devide)

    seed = args.random_seed
    torch.manual_seed(seed)
    if args.cuda:
        torch.cuda.manual_seed(seed)
        net = net.cuda()

    # `get_dataloader` is driven by `args.state`, so the attribute is set
    # right before each call; it is left at 'test' afterwards.
    loaders = {}
    for split in ('train', 'test'):
        args.state = split
        loaders[split] = data_utils.get_dataloader(args)

    print('begin training')
    train.train(net, loaders['train'], logger, args)
    print('end training\n\n')

    print('begin eval')
    eval.eval(net, loaders['test'], logger, args)
    print('end eval')
コード例 #2
0
def main():
    """Main process for distributed training with optional decomposition.

    Every participating process runs this function. In order, it:

    1. parses and checks the command-line arguments and seeds the RNGs;
    2. joins the process group via ``dist.init_process_group``;
    3. builds the MNIST datasets (only ``local_rank`` 0 downloads) and the
       data loaders, using a ``DistributedSampler`` for the training set;
    4. builds ``Net`` and optionally loads pretrained weights;
    5. optionally applies tensor decomposition, either computed by
       ``local_rank`` 0 and shared through files ('online') or loaded from
       existing files on every process (any other ``run_mode``);
    6. wraps the model in ``DDP``, trains it, and lets ``rank`` 0 run the
       test pass and save the trained weights.

    The ``dist.barrier()`` calls separate the "local_rank 0 writes" steps
    from the "other local_ranks read" steps, so their position matters.
    """

    # Settings
    args = parse_args()
    check_args(args)
    use_gpu = args.use_gpu and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    local_rank = args.local_rank  # Local identifier of the current node
    device = torch.device('cpu')
    if use_gpu:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True  # Make the results exactly
        torch.backends.cudnn.benchmark = False  # the same under the same seed
        # On GPU, `device` is the integer local_rank (used as a device index)
        # rather than a torch.device object.
        device = local_rank
        torch.cuda.set_device(device)
    dist.init_process_group(backend=args.backend, init_method=args.init_method)
    rank = dist.get_rank()  # Unique identifier among all processes

    # Prepare data
    socket.setdefaulttimeout(60)  # Connection time limit for data downloading
    transform = transforms.Compose([transforms.ToTensor(), ])
    # Only local_rank 0 is allowed to download; the others build the dataset
    # after the barrier, without `download=True`.
    if local_rank == 0:
        train_dataset = datasets.MNIST(
            args.data_path, train=True, download=True, transform=transform)
    dist.barrier()  # Wait for local_rank 0 to finish downloading the dataset
    if local_rank != 0:
        train_dataset = datasets.MNIST(
            args.data_path, train=True, transform=transform)
    test_dataset = datasets.MNIST(
        args.data_path, train=False, transform=transform)
    # Shuffling is delegated to the sampler, hence `shuffle=False` on the
    # training DataLoader below.
    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset, shuffle=True)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, shuffle=False,
        num_workers=1, pin_memory=use_gpu, sampler=train_sampler)
    # The test loader has no distributed sampler; it is only consumed by
    # rank 0 (see "Train and test" below).
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, shuffle=False,
        num_workers=1, pin_memory=use_gpu)

    # Build the model
    model = Net()

    # Pretrained weights should be loaded before using auto_decomposition
    if args.pretrained_path is not None:
        pretrained_path = os.path.realpath(args.pretrained_path)
        state_dict = torch.load(pretrained_path, map_location='cpu')
        model.load_state_dict(state_dict)
        if rank == 0:
            print('Loaded pretrained weights file: {}'.format(pretrained_path))

    # Use tensor decomposition here, after building the model, before DDP and
    # before passing model parameters to the optimizer
    if args.tensor_decompose:
        dec_info_path = args.decompose_info_path
        # NOTE(review): `os.path.realpath` raises on None, so this assumes
        # `check_args` guarantees `decomposed_weights_path` is set -- confirm.
        dec_weights_path = os.path.realpath(args.decomposed_weights_path)
        if args.run_mode == 'online':
            if local_rank == 0:
                # Decompose the model on local_rank 0, and save the
                # decomposition information file. It will take some time.
                # Pretrained weights should have been loaded to the model.
                model, _ = auto_decomposition(model, dec_info_path)

                # Save the decomposed weights
                os.makedirs(os.path.dirname(dec_weights_path), exist_ok=True)
                torch.save(model.state_dict(), dec_weights_path)
                print('Decomposed weights file is saved to: {}'.format(
                    dec_weights_path))

            # Wait until all local_ranks get here. This makes the other
            # local_ranks wait for local_rank 0 to complete, in order to use
            # the decomposition information and weights saved by local_rank 0.
            dist.barrier()

            if local_rank != 0:
                # Decompose the model with the saved decomposition information
                model, _ = decompose_network(model, dec_info_path)

                # Load the saved weights
                state_dict = torch.load(dec_weights_path, map_location='cpu')
                model.load_state_dict(state_dict)

        else:  # offline
            # Use existing decomposition information and decomposed weights for
            # all local_ranks
            model, _ = decompose_network(model, dec_info_path)
            state_dict = torch.load(dec_weights_path, map_location='cpu')
            model.load_state_dict(state_dict)
            if rank == 0:
                print('Loaded decomposed weights file: {}'.format(
                    dec_weights_path))

    # Put the model on target device
    model = model.to(device)

    # DDP
    # An empty `device_ids` list is passed when running on CPU.
    device_ids = [local_rank] if use_gpu else []
    model = DDP(model, device_ids=device_ids)

    # Build optimizer
    # Note: please do not load optimizer parameters of the original model after
    # tensor decomposition, since the model parameters have been changed.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Train and test
    # The last argument of `train` is True only on rank 0.
    # NOTE(review): presumably it enables progress logging -- confirm in `train`.
    train(model, device, train_loader,
          optimizer, args.steps, args.log_steps, rank == 0)
    # Only rank 0 evaluates.
    if rank == 0:
        test(model, device, test_loader)

    # Save model weights
    # `model.module` is the underlying network inside the DDP wrapper, so the
    # saved state dict belongs to the unwrapped model.
    if rank == 0 and args.save_path is not None:
        save_path = os.path.realpath(args.save_path)
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        torch.save(model.module.state_dict(), save_path)
        print('Trained weights file is saved to: {}'.format(save_path))
コード例 #3
0
def train_and_evaluate(model,
                       optimizer,
                       train_loader,
                       val_loader,
                       loss_fn,
                       metrics,
                       params,
                       run_dir,
                       device,
                       scheduler=None,
                       restore_file=None,
                       writer=None):
    """
    Train the model and evaluate it on the validation set after every epoch.

    After each epoch a checkpoint (epoch number, model state and optimizer
    state) and the batch summaries are written to `run_dir`. An epoch whose
    validation 'accuracy' is greater than or equal to the best seen so far is
    flagged as the best one when the checkpoint is saved.

    Args:
        model: (inherits torch.nn.Module) the custom neural network model
        optimizer: (inherits torch.optim) optimizer to update the model parameters
        train_loader: (DataLoader) a torch.utils.data.DataLoader object that fetches
                      the training set
        val_loader: (DataLoader) a torch.utils.data.DataLoader object that fetches
                    the validation set
        loss_fn : (function) a function that takes batch_output (tensor) and batch_labels
                  (np.ndarray) and return the loss (tensor) over the batch
        metrics: (dict) a dictionary of functions that compute a metric using the
                 batch_output and batch_labels
        params: (Params) hyperparameters; `params.num_epochs` sets the number of epochs
        run_dir: (string) directory containing params.json, learned weights, and logs
        device: (str) device type; usually 'cuda:0' or 'cpu'
        scheduler: optional learning-rate scheduler; its `step()` method is called
                   once per epoch, after evaluation
        restore_file: (string) optional = name of the checkpoint file in `run_dir` to
                      restore from, given WITHOUT extension ('.pth.zip' is appended).
                      If the file does not exist a warning is logged and nothing
                      is restored.
        writer: (tensorboard) optional tensorboard summary writer

    Returns:
        None
    """

    # reload the weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(run_dir, restore_file + '.pth.zip')
        if os.path.exists(restore_path):
            logging.info("Restoring weights from {}".format(restore_path))
            load_checkpoint(restore_path, model, optimizer)
        else:
            # The caller explicitly asked for a restore, so do not skip it
            # silently: make the missing checkpoint visible in the log.
            logging.warning(
                "Restore file {} not found; continuing without restoring".
                format(restore_path))

    best_val_accu = 0.0

    for epoch in range(params.num_epochs):

        # running one epoch
        logging.info("Epoch {} / {}".format(epoch + 1, params.num_epochs))

        # logging current learning rate
        for i, param_group in enumerate(optimizer.param_groups):
            logging.info("learning rate = {} for parameter group {}".format(
                param_group['lr'], i))

        # train for one full pass over the training set
        train_metrics, batch_summ = train(model, optimizer, loss_fn,
                                          train_loader, metrics, params,
                                          epoch, device, writer)

        # evaluate for one epoch on the validation set
        val_metrics = evaluate(model, loss_fn, val_loader, metrics, params,
                               device)

        # schedule learning rate
        if scheduler is not None:
            scheduler.step()

        # check if current epoch has best accuracy (ties count as best)
        val_accu = val_metrics['accuracy']
        is_best = val_accu >= best_val_accu

        # save weights
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=run_dir)

        # save batch summaries
        save_batch_summary(run_dir, batch_summ)

        # if best accuracy
        if is_best:
            logging.info(
                "- Found new best accuray model at epoch {}".format(epoch + 1))
            best_val_accu = val_accu

        # add training log to tensorboard
        if writer is None:
            continue

        # train and validation per-epoch mean metrics; only metrics present
        # in both dictionaries are plotted
        for metric, value in train_metrics.items():
            if metric in val_metrics:
                writer.add_scalars(metric, {
                    'train': value,
                    'val': val_metrics[metric]
                }, epoch)

        # layer weights / gradients distributions
        for idx, m in enumerate(model.modules()):
            if not isinstance(m, (nn.Conv2d, nn.Linear)):
                continue
            # The gradient lives on the weight, so it can only be inspected
            # when the weight itself exists.
            if m.weight is None:
                continue
            writer.add_histogram('layer{}.weight'.format(idx), m.weight,
                                 epoch)
            if m.weight.grad is not None:
                writer.add_histogram('layer{}.weight.grad'.format(idx),
                                     m.weight.grad, epoch)
コード例 #4
0
def main():
    """Run single-process MNIST training with optional tensor decomposition.

    The function parses and checks the command-line arguments, seeds the
    RNGs, builds the MNIST loaders and ``Net``, optionally loads pretrained
    weights, optionally decomposes the model, then trains, tests and
    (if ``args.save_path`` is given) saves the trained weights.
    """

    # ---- Settings ----
    args = parse_args()
    check_args(args)
    use_gpu = args.use_gpu and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    if use_gpu:
        torch.cuda.manual_seed(args.seed)
        # Same seed => same results: force deterministic cuDNN kernels and
        # turn off the benchmark mode.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    device = torch.device('cuda' if use_gpu else 'cpu')

    # ---- Data ----
    # Connection time limit for data downloading
    socket.setdefaulttimeout(60)
    to_tensor = transforms.Compose([transforms.ToTensor(), ])
    train_dataset = datasets.MNIST(
        args.data_path, train=True, download=True, transform=to_tensor)
    test_dataset = datasets.MNIST(
        args.data_path, train=False, transform=to_tensor)
    # Options shared by both loaders.
    loader_options = dict(num_workers=1, pin_memory=use_gpu)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, shuffle=True,
        **loader_options)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, shuffle=False,
        **loader_options)

    # ---- Model ----
    model = Net()

    # Pretrained weights must be in place before auto_decomposition runs.
    if args.pretrained_path is not None:
        pretrained_path = os.path.realpath(args.pretrained_path)
        model.load_state_dict(
            torch.load(pretrained_path, map_location='cpu'))
        print('Loaded pretrained weights: {}'.format(pretrained_path))

    # Tensor decomposition goes here: after the model is built and before
    # its parameters are handed to the optimizer.
    if args.tensor_decompose:
        info_path = args.decompose_info_path
        weights_path = args.decomposed_weights_path

        if args.run_mode != 'online':
            # Offline: reuse existing decomposition information and the
            # matching decomposed weights.
            model, _ = decompose_network(model, info_path)
            weights_path = os.path.realpath(weights_path)
            model.load_state_dict(
                torch.load(weights_path, map_location='cpu'))
            print('Loaded decomposed weights file: {}'.format(
                weights_path))
        else:
            # Online: decompose now (this takes some time) and write the
            # decomposition information file if needed. Pretrained weights
            # should already be loaded at this point.
            model, _ = auto_decomposition(model, info_path)

            # Saving the decomposed weights is optional.
            if weights_path is not None:
                weights_path = os.path.realpath(weights_path)
                os.makedirs(os.path.dirname(weights_path), exist_ok=True)
                torch.save(model.state_dict(), weights_path)
                print('Decomposed weights file is saved to: {}'.format(
                    weights_path))

    # Put the model on target device
    model = model.to(device)

    # ---- Optimizer ----
    # Do not load optimizer parameters of the original model after tensor
    # decomposition: the model parameters have been changed.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # ---- Train and test ----
    train(model, device, train_loader,
          optimizer, args.steps, args.log_steps, True)
    test(model, device, test_loader)

    # ---- Save model weights ----
    if args.save_path is None:
        return
    save_path = os.path.realpath(args.save_path)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model.state_dict(), save_path)
    print('Trained weights file is saved to: {}'.format(save_path))
コード例 #5
0
ファイル: main.py プロジェクト: mengqi-coder/centerloss
from common.train import train
from Reader import Reader
import tensorflow as tf

if __name__ == '__main__':
    # Script entry point: build the data reader and the two input
    # placeholders, then hand everything to `train`.
    batch_size = 100  # shared by the reader and both placeholder shapes
    category_num = 100

    reader = Reader(
        data_dir='/home/give/Documents/dataset/cifar/cifar-100-python',
        batch_size=batch_size,
        reshape_flag=True)

    # Input images: one batch of 32x32 images with 3 channels.
    image_tensor = tf.placeholder(
        dtype=tf.float32,
        shape=[batch_size, 32, 32, 3],
        name='input_x')
    # Labels: one row of `category_num` values per image.
    label_tensor = tf.placeholder(
        dtype=tf.float32,
        shape=[batch_size, category_num],
        name='input_y')

    # NOTE(review): `restore_obj` is built but never used -- `train` is
    # called with restore=None below. Confirm whether that is intended.
    restore_obj = {
        'path': '/home/give/PycharmProjects/Reproduce/CenterLoss/MNIST/parameters',
    }

    # NOTE(review): the third positional argument is presumably the number
    # of training iterations -- confirm against `common.train.train`.
    train(image_tensor,
          label_tensor,
          int(1e6),
          reader,
          restore=None,
          output_num=category_num)