# Beispiel #1
# 0
 def __init__(self):
     """Build the search trainer: config, logging, data, model, KD parts, and optimizers."""
     self.config = SearchConfig()
     # TensorBoard writer is optional; only created when a log dir is configured.
     self.writer = None
     if self.config.tb_dir != "":
         from torch.utils.tensorboard import SummaryWriter
         self.writer = SummaryWriter(self.config.tb_dir, flush_secs=20)
     init_gpu_params(self.config)
     set_seed(self.config)
     # Both flags passed to FileLogger are is_master — presumably gating
     # file and console output to the master process; TODO confirm.
     self.logger = FileLogger('./log', self.config.is_master,
                              self.config.is_master)
     self.load_data()
     self.logger.info(self.config)
     self.model = SearchCNNController(self.config, self.n_classes,
                                      self.output_mode)
     self.load_model()
     # Knowledge-distillation components (sets up self.teacher_model / self.emd_tool,
     # used by Architect below — TODO confirm).
     self.init_kd_component()
     if self.config.n_gpu > 0:
         self.model.to(device)
     if self.config.n_gpu > 1:
         self.model = torch.nn.parallel.DistributedDataParallel(
             self.model,
             device_ids=[self.config.local_rank],
             find_unused_parameters=True)
     # Keep a handle to the unwrapped module for printing when wrapped in DDP.
     self.model_to_print = self.model if self.config.multi_gpu is False else self.model.module
     self.architect = Architect(self.model, self.teacher_model, self.config,
                                self.emd_tool)
     mb_params = param_size(self.model)
     self.logger.info("Model size = {:.3f} MB".format(mb_params))
     self.eval_result_map = []
     self.init_optim()
# Beispiel #2
# 0
def run(args, myargs):
    """Merge CLI args and the per-command overrides into a SearchConfig,
    wire up writer/logger, resolve paths, and launch main()."""
    cfg = SearchConfig()

    # Command-line options must not collide with existing config fields.
    for key, value in args.items():
        assert not hasattr(cfg, key)
        setattr(cfg, key, value)

    # Per-command overrides; warn when a key is new to the config.
    command_cfg = getattr(myargs.config, args.command)
    for key, value in command_cfg.items():
        if not hasattr(cfg, key):
            print('* config does not have %s' % key)
        setattr(cfg, key, value)

    writer = myargs.writer
    writer.add_text('all_config', cfg.as_markdown(), 0)
    logger = myargs.logger
    cfg.print_params(logger.info_msg)

    # Normalize paths relative to the run's output directory.
    cfg.data_path = os.path.expanduser(cfg.data_path)
    cfg.plot_path = os.path.join(args.outdir, 'plot')
    cfg.path = args.outdir

    main(config=cfg, logger=logger, device=torch.device("cuda"), myargs=myargs)
# Beispiel #3
# 0
def main():
    """Fine-tune stage: load the best searched model, prune it, and retrain
    its weights with SGD + cosine annealing, checkpointing the best top-1."""
    config = SearchConfig(section='fine-tune')

    device = torch.device("cuda")

    # tensorboard
    writer = SummaryWriter(log_dir=os.path.join(config.path, "tb"))
    writer.add_text('config', config.as_markdown(), 0)

    logger = utils.get_logger(
        os.path.join(config.path, "{}_tune.log".format(config.name)))
    config.print_params(logger.info)

    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = True

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        config.dataset, config.data_path, cutout_length=0, validation=True)

    logger.debug('loading checkpoint')
    best_path = os.path.join(config.path, 'best.pth.tar')

    # NOTE(review): loads a full pickled model object (not a state_dict);
    # the original model classes must be importable for this to succeed.
    model = torch.load(best_path)

    # Drop the pruned operations before fine-tuning.
    model.prune()

    model = model.to(device)

    # weights optimizer
    w_optim = torch.optim.SGD(model.weights(),
                              config.w_lr,
                              momentum=config.w_momentum,
                              weight_decay=config.w_weight_decay)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.workers,
                                               pin_memory=True)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_lr_min)
    architect = Architect(model, config.w_momentum, config.w_weight_decay)
    model.print_alphas(logger)
    # Baseline accuracy of the loaded/pruned model before any fine-tuning.
    first_top1 = validate(valid_loader, model, -1, 0, device, config, logger,
                          writer)
    os.system('mkdir -p ' + config.fine_tune_path)
    # training loop
    best_top1 = 0.
    for epoch in range(config.epochs):
        # NOTE(review): scheduler stepped before training each epoch —
        # pre-1.1 PyTorch ordering; newer PyTorch versions warn about this.
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]

        model.print_alphas(logger)

        # training
        train(train_loader, model, architect, w_optim, lr, epoch, writer,
              device, config, logger)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, epoch, cur_step, device, config,
                        logger, writer)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.fine_tune_path, is_best)
        print("")

    logger.info("Initial best Prec@1 = {:.4%}".format(first_top1))
    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
# Beispiel #4
# 0
import torch.nn.functional as F
import torch.backends.cudnn as cudnn


def check_error(output, k_model, input_np, epsilon=1e-5):
    """Compare a PyTorch output tensor against a Keras model's prediction.

    Args:
        output: CPU torch tensor produced by the PyTorch model.
        k_model: Keras-like model; ``k_model.predict(input_np)`` must return
            an ndarray broadcastable against ``output``.
        input_np: numpy input fed to the Keras model.
        epsilon: maximum tolerated absolute element-wise difference.

    Returns:
        The maximum absolute element-wise difference between the two outputs.

    Raises:
        AssertionError: if the difference exceeds ``epsilon``.
    """
    pytorch_output = output.data.numpy()
    keras_output = k_model.predict(input_np)

    # Bug fix: take the absolute difference. The raw signed max could be tiny
    # (or negative) even when the Keras output exceeds the PyTorch output by a
    # large margin, silently passing the check.
    error = np.max(np.abs(pytorch_output - keras_output))
    print('Error:', error)

    assert error < epsilon
    return error


# Module-level setup (runs on import): config, device, TB writer, and logger
# shared by the functions below.
config = SearchConfig()

device = torch.device("cuda")

# tensorboard
writer = SummaryWriter(log_dir=os.path.join(config.path, "tb"))
writer.add_text('config', config.as_markdown(), 0)

# File logger writes to <config.path>/<name>.log.
logger = utils.get_logger(
    os.path.join(config.path, "{}.log".format(config.name)))
config.print_params(logger.info)


def main():
    # NOTE(review): snippet appears truncated; only the start of training is visible.
    logger.info("Logger is set - training start")
# Beispiel #5
# 0
""" Search cell """
import os
import torch
import torch.nn as nn
import numpy as np
# from tensorboardX import SummaryWriter
from config import SearchConfig
import utils
from models.search_cnn import SearchCNNController
from architect import Architect
# from visualize import plot

# Module-level setup (runs on import): config, device, and logger.
# TensorBoard wiring is intentionally commented out in this variant.
config = SearchConfig()
print(config.gpus)
device = torch.device("cuda")

# tensorboard
# writer = SummaryWriter(log_dir=os.path.join(config.path, "tb"))
# writer.add_text('config', config.as_markdown(), 0)

logger = utils.get_logger(
    os.path.join(config.path, "{}.log".format(config.name)))
# config.print_params(logger.info)


def main():
    """Search entry point (snippet appears truncated after GPU selection)."""
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])
# Beispiel #6
# 0
def main():
    """DARTS-style architecture search: alternate weight and alpha updates,
    validate per epoch, and checkpoint the best genotype.

    Supports a one-epoch profiling mode when config.profile is non-zero.
    """
    config = SearchConfig()

    device = torch.device("cuda")

    # tensorboard
    tb_path = os.path.join(config.path, "tb")
    # NOTE(review): shells out to delete stale TB logs; failures are ignored.
    os.system('rm -r ' + tb_path)
    writer = SummaryWriter(log_dir=tb_path)
    writer.add_text('config', config.as_markdown(), 0)

    logger = utils.get_logger(
        os.path.join(config.path, "{}_train.log".format(config.name)))
    config.print_params(logger.info)

    logger.info("Logger is set - training start")
    if int(config.profile) != 0:
        # Profiling mode: run a single epoch, capped at `print_freq` batches.
        logger.info('entering profile mode')
        profile = True
        config.epochs = 1
        max_batches = config.print_freq
    else:
        profile = False
        max_batches = None
    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = True

    # get data with meta info
    input_size, input_channels, n_classes, train_data = utils.get_data(
        config.dataset, config.data_path, cutout_length=0, validation=False)

    # Controller class is configurable via a dotted path, e.g. "pkg.mod.Class".
    module_name, class_name = config.controller_class.rsplit('.', 1)
    controller_cls = getattr(import_module(module_name), class_name)
    model = controller_cls(device, **config.__dict__)
    model = model.to(device)

    # weights optimizer
    w_optim = torch.optim.SGD(model.weights(),
                              config.w_lr,
                              momentum=config.w_momentum,
                              weight_decay=config.w_weight_decay)
    # alphas optimizer
    alpha_optim = torch.optim.Adam(model.alphas(),
                                   config.alpha_lr,
                                   betas=(0.5, 0.999),
                                   weight_decay=config.alpha_weight_decay)

    # split data to train/validation
    n_train = len(train_data)
    split = int(n_train * config.validate_split)
    indices = list(range(n_train))
    if split <= 0:
        # validate_split == 0 means: validate on the training set itself.
        logger.debug('using train as validation')
        valid_sampler = train_sampler = torch.utils.data.sampler.SubsetRandomSampler(
            indices)
    else:
        train_sampler = torch.utils.data.sampler.SubsetRandomSampler(
            indices[:split])
        valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:])

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               sampler=train_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               sampler=valid_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_lr_min)
    architect = Architect(model, config.w_momentum, config.w_weight_decay)

    # training loop
    best = 0
    best_genotype = None

    for epoch in range(config.epochs):
        # NOTE(review): scheduler stepped before training each epoch —
        # pre-1.1 PyTorch ordering; newer PyTorch versions warn about this.
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]

        model.print_alphas(logger)

        # training
        if profile:
            with torch.autograd.profiler.profile(use_cuda=True) as prof:
                train_qual = train(train_loader,
                                   valid_loader,
                                   model,
                                   architect,
                                   w_optim,
                                   alpha_optim,
                                   lr,
                                   epoch,
                                   writer,
                                   device,
                                   config,
                                   logger,
                                   max_batches=max_batches)
            print('cpu')
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))
            print(prof.key_averages().table(sort_by="cpu_time", row_limit=10))
            print('cuda')
            print(prof.key_averages().table(sort_by="cuda_time_total",
                                            row_limit=10))
            print(prof.key_averages().table(sort_by="cuda_time", row_limit=10))
            # Profile mode runs a single epoch and exits the loop.
            break

        model.new_epoch(epoch, writer)
        train_qual = train(train_loader,
                           valid_loader,
                           model,
                           architect,
                           w_optim,
                           alpha_optim,
                           lr,
                           epoch,
                           writer,
                           device,
                           config,
                           logger,
                           max_batches=max_batches)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        val_qual = validate(valid_loader, model, epoch, cur_step, device,
                            config, logger, writer)

        # log
        # genotype
        genotype = model.genotype()
        logger.info("genotype = {}".format(genotype))

        # genotype as a image
        plot_path = os.path.join(config.plot_path,
                                 "EP{:02d}".format(epoch + 1))
        caption = "Epoch {}".format(epoch + 1)
        model.plot_genotype(plot_path, caption)
        #plot(genotype.normal, plot_path + "-normal", caption)
        #plot(genotype.reduce, plot_path + "-reduce", caption)

        # Pick the quality metric used for checkpointing.
        if config.use_train_quality != 0:
            cur_qual = train_qual
        else:
            cur_qual = val_qual

        # save
        if best < cur_qual:
            best = cur_qual
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)
        logger.info("Quality{}: {} \n\n".format('*' if is_best else '',
                                                cur_qual))

    logger.info("Final best =  {}".format(best))
    logger.info("Best Genotype = {}".format(best_genotype))
# Beispiel #7
# 0
import os
import pickle
import sys
from tensorboardX import SummaryWriter
import time
import torch
import torch.nn as nn

from config import SearchConfig
from data_loader import load_dataset
import genotypes as gts
from search_cnn import SearchCNN
import utils


# Module-level setup (runs on import): config, alpha output dir, device,
# TB writer, and logger.
config = SearchConfig()
config.alpha_dir = os.path.join(config.stage_dir, "alphas")
# os.makedirs replaces `os.system("mkdir -p ...")`: portable (no shell),
# and equivalent thanks to exist_ok=True.
os.makedirs(config.alpha_dir, exist_ok=True)

device = torch.device("cuda")

# tensorboard
writer = SummaryWriter(log_dir=config.log_dir)
writer.add_text("config", config.as_markdown(), 0)

logger = utils.get_logger(
    os.path.join(config.log_dir, "{}_{}.log".format(
        config.name, config.stage)))
config.print_args(logger.info)

# Beispiel #8
# 0
import os
import random
import time

import numpy as np
import torch
import torch.nn as nn

import utils
from config import SearchConfig
from datasets.mld import get_search_datasets
from model import Model
from cdarts import CdartsTrainer

if __name__ == "__main__":
    config = SearchConfig()
    # Only rank 0 (or the single non-distributed process) does filesystem
    # setup and parameter printing.
    main_proc = not config.distributed or config.local_rank == 0
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=config.dist_url,
                                             rank=config.local_rank,
                                             world_size=config.world_size)
    if main_proc:
        os.makedirs(config.output_path, exist_ok=True)
    # Barrier so non-main ranks wait until the output dir exists.
    if config.distributed:
        torch.distributed.barrier()
    logger = utils.get_logger(os.path.join(config.output_path, 'search.log'))
    if main_proc:
        config.print_params(logger.info)
    utils.reset_seed(config.seed)
# Beispiel #9
# 0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import os
import sys
import time
import torch
import torch.nn as nn

from config import SearchConfig
from data_loader import load_dataset
import utils

# Module-level setup (runs on import): config, model dir, device, and logger.
config = SearchConfig()
config.model_dir = os.path.join(config.save_dir, "augment/models")

device = torch.device("cuda")

logger = utils.get_logger(
    os.path.join(config.log_dir, "{}_{}.log".format(config.name,
                                                    config.stage)))


def test(data_loader, model, criterion):
    """Evaluate `model` on `data_loader`, tracking loss and top-1/top-5 accuracy.

    NOTE(review): body appears truncated in this snippet — only meter setup
    and eval-mode switching are visible.
    """
    loss = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    model.eval()
# Beispiel #10
# 0
def main():
    """Entry point: seed, resolve the distributed rendezvous URL
    (SLURM-aware), then spawn or directly call the worker process(es)."""
    # init config
    config = SearchConfig()

    # set seed
    if config.seed is not None:
        np.random.seed(config.seed)
        torch.manual_seed(config.seed)
        torch.cuda.manual_seed_all(config.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! ')

    # For slurm available
    if config.world_size == -1 and "SLURM_NPROCS" in os.environ:
        # acquire world size from slurm
        config.world_size = int(os.environ["SLURM_NPROCS"])
        config.rank = int(os.environ["SLURM_PROCID"])
        jobid = os.environ["SLURM_JOBID"]
        # Per-job file used to share the rendezvous URL between ranks.
        hostfile = os.path.join(config.dist_path, "dist_url." + jobid  + ".txt")
        if config.dist_file is not None:
            config.dist_url = "file://{}.{}".format(os.path.realpath(config.dist_file), jobid)
        elif config.rank == 0:
            # Rank 0 picks the address/port and publishes it via the hostfile.
            if config.dist_backend == 'nccl' and config.infi_band:
                # only NCCL backend supports infiniband
                interface_str = 'ib{:d}'.format(config.infi_band_interface)
                print("Use infiniband support on interface " + interface_str + '.')
                os.environ['NCCL_SOCKET_IFNAME'] = interface_str
                os.environ['GLOO_SOCKET_IFNAME'] = interface_str
                # Parse the interface's IPv4 address out of `ip addr show`.
                ip_str = os.popen('ip addr show ' + interface_str).read()
                ip = ip_str.split("inet ")[1].split("/")[0]
            else:
                if config.world_size == 1:  # use only one node
                    ip = '127.0.0.1'
                else:
                    ip = socket.gethostbyname(socket.gethostname())
            port = find_free_port()
            config.dist_url = "tcp://{}:{}".format(ip, port)
            with open(hostfile, "w") as f:
                f.write(config.dist_url)
        else:
            # Non-zero ranks poll until rank 0 has written the hostfile.
            while not os.path.exists(hostfile):
                time.sleep(5)  # wait for the main process
            with open(hostfile, "r") as f:
                config.dist_url = f.read()
        print("dist-url:{} at PROCID {} / {}".format(config.dist_url, config.rank, config.world_size))

    # support multiple GPU on one node
    # assume each node have equal GPUs
    ngpus_per_node = torch.cuda.device_count()
    if config.mp_dist:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        config.world_size = ngpus_per_node * config.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # worker process function
        mp.spawn(worker, nprocs=ngpus_per_node, args=(ngpus_per_node, config))
    else:
        # Simply call worker function on first GPU device
        worker(None, ngpus_per_node, config)
# Beispiel #11
# 0
from __future__ import division
from __future__ import print_function

import numpy as np
import os
import sys
import time
import torch
import torch.nn as nn

from config import SearchConfig
from data_loader import load_dataset
import feature_map as fmp
import utils

# Module-level setup (runs on import): config, feature output dir, device,
# and logger for the feature-extraction stage.
config = SearchConfig()
config.feature_dir = os.path.join(config.stage_dir, "features")
# os.makedirs replaces `os.system("mkdir -p ...")`: portable (no shell),
# and equivalent thanks to exist_ok=True.
os.makedirs(config.feature_dir, exist_ok=True)
config.stage = "feature"
config.total_samples = 10000

device = torch.device("cuda")

logger = utils.get_logger(
    os.path.join(config.log_dir, "{}_{}.log".format(config.name,
                                                    config.stage)))


def compute_offline(data_loader, model, feature_dir):
    """Precompute features over `data_loader` into `feature_dir`/offline.

    NOTE(review): body appears truncated in this snippet.
    """
    logger.info("computing offline...")
    save_dir = os.path.join(feature_dir, "offline")
# Beispiel #12
# 0
def get_current_node_count():
    """Return the number of nodes in the current PAI task role (1 outside PAI)."""
    role = os.environ.get("PAI_CURRENT_TASK_ROLE_NAME")
    if role is None:
        return 1
    return int(os.environ["PAI_TASK_ROLE_TASK_COUNT_" + role])


def get_current_node_index():
    """Return this node's index within its PAI task role (0 outside PAI)."""
    return int(os.environ.get("PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX", "0"))


if __name__ == "__main__":
    config = SearchConfig()

    # NNI integration: either mock a result for a "gt_mock" run, or pull the
    # designated subgraph and output dir from the NNI trial context.
    if config.nni:
        if config.nni == "gt_mock":
            nni_tools.mock_result()
        else:
            config.designated_subgraph = [nni_tools.get_param()]
            config.path = nni_tools.get_output_dir()

            # tensorboard
            writer = SummaryWriter(log_dir=os.path.join(config.path, "tb"))
            writer.add_text('config', config.as_markdown(), 0)

            logger = utils.get_logger(os.path.join(config.path, "{}.log".format(config.name)))
            config.print_params(logger.info)