Example #1
def debug():
    # designed to debug the encoding scheme
    seed = 0
    np.random.seed(seed)
    budget = 2000
    B, n_ops, n_cell = 5, 7, 2
    networks = []
    design_id = 1
    while len(networks) < budget:
        bit_string = []
        for c in range(n_cell):
            for b in range(B):
                bit_string += [np.random.randint(n_ops),
                               np.random.randint(b + 2),
                               np.random.randint(n_ops),
                               np.random.randint(b + 2)
                               ]

        genome = convert(bit_string)
        # check against evaluated networks in case of duplicates
        doTrain = True
        for network in networks:
            if compare(genome, network):
                doTrain = False
                break

        if doTrain:
            genotype = decode(genome)
            model = Network(16, 10, 8, False, genotype)
            model.drop_path_prob = 0.0
            data = torch.randn(1, 3, 32, 32)
            output, output_aux = model(data)  # Variable is a no-op wrapper in modern PyTorch
            networks.append(genome)
            print(design_id)
            design_id += 1
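
The bit string built above packs, for each of the n_cell cells, B nodes with four integers each: two operation indices drawn from n_ops and two input indices, where node b may connect to any of b + 2 earlier states. A minimal sketch that regroups the flat list into per-node pairs for inspection, assuming only the layout visible in the loop above (the project's convert()/compare() helpers define the canonical genome format):

def group_bit_string(bit_string, B=5, n_cell=2):
    # Regroup the flat encoding into [cell][node] -> ((op1, in1), (op2, in2)).
    cells, idx = [], 0
    for _ in range(n_cell):
        nodes = []
        for _ in range(B):
            op1, in1, op2, in2 = bit_string[idx:idx + 4]
            nodes.append(((op1, in1), (op2, in2)))
            idx += 4
        cells.append(nodes)
    return cells
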
Example #2
def main(macro_genome, micro_genome, epochs, search_space='micro',
         save='Design_1', expr_root='search', seed=0, gpu=0, init_channels=24,
         layers=11, auxiliary=False, cutout=False, drop_path_prob=0.0, batch_size=128):

    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')

    # ---- parameter values setting ----- #
    CIFAR_CLASSES = config_dict()['n_classes']
    INPUT_CHANNELS = config_dict()['n_channels']
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space in ('micro', 'micro_garbage'):
        genome = micro_genome
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES, config_dict()['n_channels'], layers, auxiliary, genotype)
    elif search_space in ('macro', 'macro_garbage'):
        genome = macro_genome
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2*init_channels),
                    (2*init_channels, 4*init_channels)]
        model = EvoNetwork(genotype, channels, CIFAR_CLASSES, (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']), decoder='residual')
    elif search_space == 'micromacro':
        genome = [macro_genome, micro_genome]
        macro_genotype = macro_encoding.decode(macro_genome)
        micro_genotype = micro_encoding.decode(micro_genome)
        genotype = [macro_genotype, micro_genotype]
        set_config('micro_creator', make_micro_creator(micro_genotype, convert=False))
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(macro_genotype, channels, CIFAR_CLASSES,
                           (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']), decoder='residual')

    else:
        raise NameError('Unknown search space type')

    # logging.info("Genome = %s", genome)
    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = sum(np.prod(v.size()) for v in model.parameters() if v.requires_grad) / 1e6
    model = model.to(device)

    logging.info("param size = %fMB", n_params)

    if config_dict()['problem'] == 'classification':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.MSELoss()
    criterion = criterion.cuda()


    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(
        parameters,
        learning_rate,
        momentum=momentum,
        weight_decay=weight_decay
    )

    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])

    if cutout:
        train_transform.transforms.append(utils.Cutout(cutout_length))

    train_transform.transforms.append(transforms.Normalize(CIFAR_MEAN, CIFAR_STD))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])

    train_data = my_cifar10.CIFAR10(root=data_root, train=True, download=False, transform=train_transform)
    valid_data = my_cifar10.CIFAR10(root=data_root, train=False, download=False, transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=1)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=1)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(epochs))

    for epoch in range(epochs):
        logging.info('epoch %d lr %e', epoch, scheduler.get_last_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer, train_params)
        logging.info(f'train_{config_dict()["performance_measure"]} %f', train_acc)
        scheduler.step()  # step after training so the first epoch runs at the initial LR

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info(f'valid_{config_dict()["performance_measure"]} %f', valid_acc)

    # calculate for flops
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS, config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH'])
    model(random_data.to(device))  # Variable wrapper no longer needed
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # save to file
    # os.remove(os.path.join(save_pth, 'log.txt'))
    with open(os.path.join(save_pth, 'log.txt'), "w") as file:
        file.write("Genome = {}\n".format(genome))
        file.write("Architecture = {}\n".format(genotype))
        file.write("param size = {}MB\n".format(n_params))
        file.write("flops = {}MB\n".format(n_flops))
        file.write("valid_acc = {}\n".format(valid_acc))

    # logging.info("Architecture = %s", genotype))

    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
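
For reference, a small helper equivalent to the n_params expression above; it reports the trainable-parameter count in millions (the quantity the log line labels "MB"). The helper name is illustrative only:

def count_trainable_params_millions(model):
    # Sum the element counts of every parameter that requires gradients, in millions.
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6
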
Example #3
def inherit_one_model(individual,
                      expr_root: str,
                      model=None,
                      args=None) -> nn.Module:
    """
    Very complicated function.
        Handles inheritance of the common components not defined in the genome of the individual.
        Also calls the function to inherit weights for the cells, weight are defined by the individual.

    TODO: Maybe document this better
    TODO: Improve Error Logging

    Args:
        individual: the individual to inherit weights
        expr_root: path as defined in args.save
        model: not used except in testing
        args: not used except in testing

    Returns:
        model
    """

    try:

        r = np.random.uniform(-0.5, 1.5)

        parent1 = projectcode.weightmanagement.common.read_parent_by_id(
            individual.parents[0], expr_root, args)
        parent2 = projectcode.weightmanagement.common.read_parent_by_id(
            individual.parents[1], expr_root, args)

        parents = common.determine_more_fit_parent(parent1, parent2)

        genotype = micro_encoding.decode(micro_encoding.convert(individual.X))
        if model is None:
            CIFAR_CLASSES = 10
            auxiliary = False
            model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                            auxiliary, genotype)
            model = common.initialize_zero(model)

        wcom = WeightComputer(parents)

        # weight merge
        model.stem[0].weight = wcom.compute_child_weight(
            "stem.0.weight",
            r,
            inherit_rules="both",
            weight_tensor=model.stem[0].weight)

        previous_reduction = False
        pp_reduction = False
        for cell_number, cell in enumerate(model.cells):
            try:

                reduction = cell_number in [
                    len(model.cells) // 3,
                    2 * len(model.cells) // 3,
                ]
                # print(reduction, previous_reduction)
                model = inherit_one_cell(
                    cell_number,
                    individual,
                    model,
                    parents,
                    reduce=reduction,
                    previous_reduce=previous_reduction,
                    weight_computer=wcom,
                    pp_reduce=pp_reduction,
                    r=r,
                )
                pp_reduction = previous_reduction
                previous_reduction = reduction

            except Exception:
                logger.warning("error in cell %d", cell_number)
                raise

        inherit = None
        child_genome = common.decode_individual(individual)
        if parents[0]["genome"].normal_concat == parents[1][
                "genome"].normal_concat:
            inherit = "both"
        elif parents[0]["genome"].normal_concat == child_genome.normal_concat:
            inherit = "first"
        elif parents[1]["genome"].normal_concat == child_genome.normal_concat:
            inherit = "second"
        else:
            inherit = "concat_mismatch"
        assert inherit is not None, "could not determine classifier inheritance"

        key = "classifier.weight"
        model.classifier.weight = wcom.compute_child_weight(
            key, r, inherit, model.classifier.weight)
        key = "classifier.bias"
        model.classifier.bias = wcom.compute_child_weight(
            key, r, inherit, model.classifier.bias)

        common.assert_non_null_weights(model.state_dict())
    except Exception:
        # note: parents may not be defined yet if the failure happened before both were read
        logger.warning(
            projectcode.weightmanagement.common.decode_individual(individual))
        logger.warning(parents[0]["genome"])
        logger.warning(parents[1]["genome"])
        logger.warning(individual.parents)
        raise

    return model
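
The crossover coefficient r is drawn from U(-0.5, 1.5), which suggests an interpolation, and occasionally an extrapolation, between the two parents' tensors weighted toward the fitter parent. The actual rule lives in WeightComputer.compute_child_weight; the formula below is only an assumption for illustration:

import torch

def blend_parent_weights(w_more_fit: torch.Tensor, w_less_fit: torch.Tensor, r: float) -> torch.Tensor:
    # r = 1 copies the fitter parent, r = 0 the less fit one; values outside
    # [0, 1] extrapolate beyond either parent.
    return r * w_more_fit + (1.0 - r) * w_less_fit
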
Example #4
def main(genome,
         epochs,
         search_space='micro',
         save='Design_1',
         expr_root='search',
         seed=0,
         gpu=0,
         init_channels=24,
         layers=11,
         auxiliary=False,
         cutout=False,
         drop_path_prob=0.0,
         train_dataset="",
         val_dataset=""):

    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    # fh = logging.FileHandler(os.path.join(save_pth, 'log.txt'))
    # fh.setFormatter(logging.Formatter(log_format))
    # logging.getLogger().addHandler(fh)

    # ---- parameter values setting ----- #
    NUM_CLASSES = 4
    CIFAR_CLASSES = NUM_CLASSES
    DATA_SHAPE = (128, 128)
    INPUT_CHANNELS = 3
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    batch_size = 16
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space == 'micro':
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES, layers, auxiliary,
                        genotype)
    elif search_space == 'macro':
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype,
                           channels,
                           CIFAR_CLASSES,
                           DATA_SHAPE,
                           decoder='residual')
    else:
        raise NameError('Unknown search space type')

    # logging.info("Genome = %s", genome)
    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = sum(
        np.prod(v.size())
        for v in model.parameters() if v.requires_grad) / 1e6
    model = model.to(device)

    logging.info("param size = %fMB", n_params)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters,
                                learning_rate,
                                momentum=momentum,
                                weight_decay=weight_decay)

    # dataset statistics; these override the CIFAR-10 defaults
    # (CIFAR-10 mean: [0.49139968, 0.48215827, 0.44653124], std: [0.24703233, 0.24348505, 0.26158768])
    DATASET_MEAN = [0.4785047, 0.45649716, 0.42604172]
    DATASET_STD = [0.31962952, 0.3112294, 0.31206125]
    CIFAR_MEAN = DATASET_MEAN
    CIFAR_STD = DATASET_STD
    #     # data agumentation
    #     train_transform = transforms.Compose([
    #         transforms.RandomCrop(32, padding=4),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor()
    #     ])

    #     if cutout:
    #         train_transform.transforms.append(utils.Cutout(cutout_length))

    #     train_transform.transforms.append(transforms.Normalize(CIFAR_MEAN, CIFAR_STD))

    #     valid_transform = transforms.Compose([
    #         transforms.ToTensor(),
    #         transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    #     ])

    #     train_data = my_cifar10.CIFAR10(root=data_root, train=True, download=True, transform=train_transform)
    #     valid_data = my_cifar10.CIFAR10(root=data_root, train=False, download=True, transform=valid_transform)

    #     # num_train = len(train_data)
    #     # indices = list(range(num_train))
    #     # split = int(np.floor(train_portion * num_train))
    train_data = train_dataset
    valid_data = val_dataset
    train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=4)

    valid_queue = torch.utils.data.DataLoader(
        valid_data,
        batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True,
        num_workers=4)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, int(epochs))

    for epoch in range(epochs):
        logging.info('epoch %d lr %e', epoch, scheduler.get_last_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     train_params)
        logging.info('train_acc %f', train_acc)
        scheduler.step()  # step after training so the first epoch runs at the initial LR

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('valid_acc %f', valid_acc)

    # calculate for flops
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS, *DATA_SHAPE)
    model(random_data.to(device))  # Variable wrapper no longer needed
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # save to file
    # os.remove(os.path.join(save_pth, 'log.txt'))
    with open(os.path.join(save_pth, 'log.txt'), "w") as file:
        file.write("Genome = {}\n".format(genome))
        file.write("Architecture = {}\n".format(genotype))
        file.write("param size = {}MB\n".format(n_params))
        file.write("flops = {}MB\n".format(n_flops))
        file.write("valid_acc = {}\n".format(valid_acc))

    # logging.info("Architecture = %s", genotype))

    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
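
Because the CIFAR-10 augmentation block is commented out above, here is a minimal torchvision pipeline built from the DATASET_MEAN/DATASET_STD statistics and the 128x128 DATA_SHAPE used in this function, in case an augmented loader is re-enabled; the crop padding and horizontal flip are assumptions carried over from the commented-out block:

from torchvision import transforms

DATASET_MEAN = [0.4785047, 0.45649716, 0.42604172]
DATASET_STD = [0.31962952, 0.3112294, 0.31206125]

train_transform = transforms.Compose([
    transforms.RandomCrop(128, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(DATASET_MEAN, DATASET_STD),
])

valid_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(DATASET_MEAN, DATASET_STD),
])
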
Example #5
def train_and_evaluate(
    genome: tuple,
    individual=None,
    args: argparse.Namespace = None,
    first_gen: bool = True,
    save: str = None,
    client_id: str = None,
):
    """
    Function to train and evaluate an individual using a TPU.

    Results are always saved in the save dir to make distributed data management easier.

    Args:
        first_gen:
        genome:
        save:
        individual:
        args:

    Returns:

    """

    if args.stream == "tpu":
        # must warm up the TPU runtime before building the model
        import torch_xla

    auxiliary = False

    assert hasattr(individual, "id")

    if not first_gen:
        # this is not the first generation, so mating should have occurred
        assert hasattr(individual, "parents")

    expr_root = ""

    save_pth = os.path.join(expr_root, "{}".format(save))
    utils.create_exp_dir(save_pth)

    CIFAR_CLASSES = 10
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = "../data"
    batch_size = args.batch_size
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        "auxiliary": auxiliary,
        "auxiliary_weight": auxiliary_weight,
        "grad_clip": grad_clip,
        "report_freq": report_freq,
    }

    if args.search_space == "micro":
        genotype = micro_encoding.decode(genome)
        model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                        auxiliary, genotype)

        if not first_gen:
            # change the way the weights are set up
            model = manage_weights(model, individual, expr_root, args)

    elif args.search_space == "macro":
        raise NotImplementedError("Not supported")
    else:
        raise NameError("Unknown search space type")

    logger.info("Architecture = %s", genotype)

    try:
        max_weight = args.max_weight
    except AttributeError:
        logger.warning("could not determine maximum weight argument; defaulting to 1e20")
        max_weight = 1e20

    clip = weightClip(max_weight=max_weight, min_weight=max_weight * -1)

    if args.stream == "tpu":
        from projectcode.training.tpu import get_map_fn
        import torch_xla.distributed.xla_multiprocessing as xmp

        WRAPPED_MODEL = xmp.MpModelWrapper(model)

        logger.info("Executing TPU Training")
        map_fn = get_map_fn(model,
                            train_params,
                            data_root,
                            momentum,
                            weight_decay,
                            CIFAR_CLASSES,
                            learning_rate,
                            args.layers,
                            batch_size,
                            epochs=args.epochs,
                            save_pth=save_pth,
                            args=args,
                            WRAPPED_MODEL=WRAPPED_MODEL,
                            clip=clip)

        FLAGS = {}

        xmp.spawn(map_fn, args=(FLAGS, ), nprocs=1, start_method="fork")

        valid_acc, n_flops = torch.load("results.pt")
    elif args.stream == "gpu":
        from projectcode.training.gpu import train_gpu
        logger.info("Executing GPU Training")
        valid_acc, n_flops = train_gpu(model,
                                       train_params,
                                       data_root,
                                       momentum,
                                       weight_decay,
                                       CIFAR_CLASSES,
                                       learning_rate,
                                       args.layers,
                                       batch_size,
                                       epochs=args.epochs,
                                       save_pth=save_pth,
                                       args=args,
                                       clip=clip)

    else:

        raise NameError("Unrecognized client stream")

    n_params = sum(
        np.prod(v.size())
        for v in model.parameters() if v.requires_grad) / 1e6

    if main_config.distributed_cloud and args.weight_init == "lammarckian":
        wt_path = f"{args.code}_{client_id}_weights_{individual.id:05d}.pt"
        torch.save(model.state_dict(), wt_path)
        blob_name = upload_blob(wt_path)
    else:
        blob_name = None
        torch.save(model.state_dict(), os.path.join(save_pth, "weights.pt"))

    result_dict = {
        "id": individual.id,
        "save_path": save_pth,
        "valid_acc": valid_acc,
        "params": n_params,
        "flops": n_flops,
        "wt_blob_name": blob_name,
    }

    dump(result_dict, os.path.join(save_pth, "result.pkl"))

    return result_dict
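
The weightClip object constructed above is project code; a sketch of what such a clamp could look like when applied with model.apply(clip) after each optimizer step (the class name and mechanism here are assumptions, not the project's implementation):

import torch.nn as nn

class WeightClipSketch:
    def __init__(self, max_weight: float, min_weight: float):
        self.max_weight = max_weight
        self.min_weight = min_weight

    def __call__(self, module: nn.Module):
        # Clamp every parameter owned directly by this module into range.
        for p in module.parameters(recurse=False):
            p.data.clamp_(self.min_weight, self.max_weight)
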