def InsertConvolution(predecessor_id, successor_id, old_model, kernel_size,
                      batch):
    """
    Function to insert a Conv-BatchNorm-Relu block

    A conv, a batchnorm and an activation layer are appended to a copy of the
    model descriptor and chained after ``predecessor_id``; the successor
    layer is rewired to read from the new activation instead of the
    predecessor. A new ``ConvNet`` is built from that descriptor, inherits
    the old weights through ``utils.InheritWeights`` and then gets hand-set
    parameters for the new conv and batchnorm layers.

    Args:
        predecessor_id: previous layer
        successor_id: next layer
        old_model: model before mutation (dict with 'model_descriptor' and
            'pytorch_model')
        kernel_size: kernel size of the new block
        batch: first batch of the train loader

    Returns:
        Returns mutated model (dict with 'pytorch_model', 'model_descriptor'
        and 'topo_ordering')

    Raises:
        IndexError: if no layer in the descriptor has id ``successor_id``.
    """
    new_model_descriptor = copy.deepcopy(old_model['model_descriptor'])
    old_pytorch_model = old_model['pytorch_model']

    successor = [
        layer for layer in new_model_descriptor['layers']
        if str(layer['id']) == str(successor_id)
    ][0]

    # The three new layers get consecutive ids.
    # NOTE(review): assumes the two ids following GetUnusedID's result are
    # unused as well - confirm against utils.GetUnusedID.
    new_id_conv = utils.GetUnusedID(new_model_descriptor)
    new_id_bn = new_id_conv + 1
    new_id_acti = new_id_bn + 1

    # Run the old model once so that layerdic holds the predecessor output,
    # whose second dimension gives the channel count for the new block.
    old_pytorch_model.forward(batch)
    channels = old_pytorch_model.layerdic[str(predecessor_id)].size()[1]

    new_layer_conv = {
        'type': 'conv',
        'params': {
            'channels': channels,
            'ks1': kernel_size,
            'ks2': kernel_size,
            "in_channels": channels
        },
        'input': [predecessor_id],
        'id': new_id_conv
    }

    new_layer_bn = {
        'type': 'batchnorm',
        'params': {
            "in_channels": channels
        },
        'input': [new_id_conv],
        'id': new_id_bn
    }

    new_layer_acti = {
        'type': 'activation',
        'params': {},
        'input': [new_id_bn],
        'id': new_id_acti
    }

    # The successor now consumes the new activation, not the predecessor.
    utils.ReplaceInput([successor], predecessor_id, new_id_acti)

    new_model_descriptor['layers'].append(new_layer_conv)
    new_model_descriptor['layers'].append(new_layer_bn)
    new_model_descriptor['layers'].append(new_layer_acti)

    new_pytorch_model = ConvNet(new_model_descriptor)
    new_pytorch_model.cuda()
    new_pytorch_model._modules[str(new_id_bn)].momentum = 1.0
    new_pytorch_model._modules[str(new_id_bn)].eps = 0.0

    new_pytorch_model.forward(batch)

    new_pytorch_model = utils.InheritWeights(old_pytorch_model,
                                             new_pytorch_model)

    # Presumably an identity kernel, judging by the helper's name - confirm
    # against conv2d_identity.
    IDConv = conv2d_identity(channels, kernel_size)

    # A conv bias has one entry per output channel, i.e. the first weight
    # dimension. (Indexing weight[1] reads the in-channel count instead and
    # raises IndexError for a single-output-channel conv.)
    conv_module = new_pytorch_model._modules[str(new_id_conv)]
    bias_shape = conv_module.weight.size()[0]

    state_dict = {
        "weight": torch.from_numpy(IDConv).cuda(),
        "bias": torch.from_numpy(np.zeros(shape=bias_shape)).cuda()
    }
    conv_module.load_state_dict(state_dict)

    # Batch Normalization layer's weight inheritance
    new_pytorch_model.forward(batch)

    # Statistics are taken from the first sample of the batch only, reduced
    # over its last two axes.
    predecessor_output_batch = new_pytorch_model.layerdic[str(new_id_conv)][0]
    predecessor_output_batch_cpu = predecessor_output_batch.cpu()
    predecessor_output_batch_data = predecessor_output_batch_cpu.data.numpy()

    batch_mean = np.mean(predecessor_output_batch_data, axis=(1, 2))

    batch_var = np.var(predecessor_output_batch_data, axis=(1, 2))

    bn_module = new_pytorch_model._modules[str(new_id_bn)]
    eps = bn_module.eps

    beta_ini = batch_mean

    rm_copy = copy.deepcopy(bn_module.running_mean)
    # torch.sqrt instead of np.sqrt: the buffer lives on the GPU after
    # .cuda(), NumPy cannot convert CUDA tensors, and .cuda() is called on
    # the result below. torch.sqrt already returns a fresh tensor.
    rv_copy = torch.sqrt(bn_module.running_var + eps)

    state_dict = {
        "weight": nn.Parameter(rv_copy.cuda()),
        "bias": nn.Parameter(rm_copy.cuda()),
        "running_var": torch.from_numpy(batch_var).cuda(),
        "running_mean": torch.from_numpy(beta_ini).cuda()
    }
    bn_module.load_state_dict(state_dict)

    new_model = {
        'pytorch_model': new_pytorch_model,
        'model_descriptor': new_model_descriptor,
        'topo_ordering': new_pytorch_model.topo_ordering
    }

    return new_model
def MergeLayersConcatWithDownSampling(layer1_id, layer2_id,
                                      downsampling_factor, old_model, batch):
    """
    Function to merge layers by concatenation, from layer 1 to layer 2

    A 'concat' merge layer is added that joins the outputs of ``layer1_id``
    and ``layer2_id``. When ``downsampling_factor`` is not 1, the layer with
    the larger output (compared on dimension 2) is first passed through a
    max-pool layer. The first layer that followed ``layer2_id`` receives a
    new id, reads from the merge layer, and has its ``in_channels`` set to
    the sum of both parents' channels. Its weights for the original input
    channels are copied from the old model; the additional input channels
    are zero-initialised.

    Args:
        layer1_id: layer 1
        layer2_id: layer 2
        downsampling_factor:  If downsampling_factor is not one, pooling needed
        old_model: model before mutation
        batch: first batch of the train loader

    Returns:
        Returns mutated model, or ``old_model`` unchanged when the forward
        pass of the mutated network raises.
    """

    new_model_descriptor = copy.deepcopy(old_model['model_descriptor'])
    old_pytorch_model = old_model['pytorch_model']

    [subsequentlayers, _,
     _] = utils.GetSubsequentLayers(int(layer2_id), new_model_descriptor)

    new_id = utils.GetUnusedID(new_model_descriptor)
    new_id_subseq = new_id + 1

    # If the downsampling factor is not 1, we need a pooling layer for
    # dimension matching
    if downsampling_factor != 1:

        new_id_pool = new_id_subseq + 1

        # Check which layer is larger: that one gets pooled, the other one
        # enters the merge directly.
        old_pytorch_model.forward(batch)
        size_layer1 = old_pytorch_model.layerdic[str(layer1_id)].size()[2]
        size_layer2 = old_pytorch_model.layerdic[str(layer2_id)].size()[2]

        if size_layer1 > size_layer2:
            pooled_id, direct_id = int(layer1_id), int(layer2_id)
        else:
            pooled_id, direct_id = int(layer2_id), int(layer1_id)

        pool_layer = {
            'type': 'pool',
            'params': {
                'poolsize': downsampling_factor,
                'pooltype': 'max'
            },
            'id': new_id_pool,
            'input': [pooled_id]
        }
        new_model_descriptor['layers'].append(pool_layer)

        merge_inputs = [direct_id, int(new_id_pool)]

    else:  # Downsampling factor is 1
        merge_inputs = [int(layer2_id), int(layer1_id)]

    merge_layer = {
        'type': 'merge',
        'params': {
            'mergetype': 'concat'
        },
        'id': new_id,
        'input': merge_inputs
    }
    new_model_descriptor['layers'].append(merge_layer)

    # The first subsequent layer changes shape, so it gets a fresh id.
    old_id_subseq = subsequentlayers[0]['id']
    subsequentlayers[0]['id'] = new_id_subseq

    utils.ReplaceInput(subsequentlayers, int(layer2_id), new_id)

    # Update input for subsequent layers of subsequent conv layer
    subsubsequentlayers, _, _ = utils.GetSubsequentLayers(
        old_id_subseq, new_model_descriptor)

    # Change the next layers input with the new layer
    utils.ReplaceInput(subsubsequentlayers, old_id_subseq, new_id_subseq)

    # Replace in_channels for subsequent layer
    # We need the number of channels from layer2_id and layer1_id
    # We can use forward of old model to calculate the shape
    # Next layer's input and parameters need to be reshaped

    old_pytorch_model.forward(batch)
    parent_1_channels = old_pytorch_model.layerdic[str(layer1_id)].shape[1]
    parent_2_channels = old_pytorch_model.layerdic[str(layer2_id)].shape[1]

    subsequentlayers[0]['params']['in_channels'] = int(
        parent_1_channels) + int(parent_2_channels)

    new_pytorch_model = ConvNet(new_model_descriptor)
    new_pytorch_model.cuda()

    new_pytorch_model = utils.InheritWeights(old_pytorch_model,
                                             new_pytorch_model)

    # Best-effort mutation: if the shapes do not fit, keep the old model.
    # Only Exception is caught so that KeyboardInterrupt / SystemExit still
    # propagate.
    try:
        new_pytorch_model.forward(batch)
    except Exception:
        print("Problem with sizes MergeLayersConcatWithDownSampling")
        return old_model

    new_module = new_pytorch_model._modules[str(new_id_subseq)]
    old_module = old_pytorch_model._modules[str(old_id_subseq)]

    # Work on detached copies: slice assignment into a Parameter that
    # requires grad is an in-place operation on a leaf tensor, which
    # autograd rejects.
    new_weights = new_module.weight.detach().clone()
    old_weights = old_module.weight.detach().clone()
    old_bias = old_module.bias.detach().clone()

    # Keep the old weights for the original input channels and zero the
    # weights of the newly concatenated channels.
    n_old_in = old_weights.shape[1]
    new_weights[:, 0:n_old_in, :, :] = old_weights
    new_weights[:, n_old_in:, :, :] = 0

    state_dict = {
        "weight": nn.Parameter(new_weights.cuda()),
        "bias": nn.Parameter(old_bias.cuda())
    }
    new_module.load_state_dict(state_dict)

    new_model = {
        'pytorch_model': new_pytorch_model,
        'model_descriptor': new_model_descriptor,
        'topo_ordering': new_pytorch_model.topo_ordering
    }

    return new_model
def SpecialChild(n_models, n_mutations, n_epochs_total, initial_model, savepath, folder_out):
    """
    generate and train children, update best model

    Every child starts from the weights of ``initial_model``. All children
    except the first one (index 0) are mutated up to ``n_mutations`` times
    via ``network_operators.MutateNetwork``; mutating stops early once the
    number of trainable parameters exceeds ``max_n_params``. Each child is
    trained, evaluated and saved. Afterwards, rounds of halving follow: the
    half of the remaining children with the highest performance values is
    trained again with a doubled epoch budget, until one child is left.

    Relies on the module-level names ``batch``, ``max_n_params``,
    ``trainloader`` and ``validloader``.

    n_models = number of child models
    n_mutations = number of mutations/network operators to be applied per model_descriptor
    n_epochs_total = number of epochs each child is trained for in the first round
    initial model = current best model_descriptor
    savepath = where to save stuff (used as a string prefix for file names)
    folder_out = where to save the general files for one run

    Returns the winning model (dict with 'pytorch_model', 'model_descriptor',
    'topo_ordering') and its performance value.
    """

    n_epochs_each = int(n_epochs_total)

    print('Train all models for', int(n_epochs_each), 'epochs.')

    # All children are initialised from the same snapshot of the parent.
    init_weights_path = savepath + 'ini_weights'
    torch.save(initial_model['pytorch_model'].state_dict(), init_weights_path)

    performance = np.zeros(shape=(n_models,))
    descriptors = []

    for model_idx in range(0, n_models):
        print('\nmodel idx ' + str(model_idx))

        # save some data
        time_overall_s = time.time()

        pytorch_model = ConvNet(initial_model['model_descriptor'])
        pytorch_model.cuda()
        pytorch_model.load_state_dict(torch.load(init_weights_path), strict=False)

        model = {'pytorch_model': pytorch_model,
                 'model_descriptor': copy.deepcopy(initial_model['model_descriptor']),
                 'topo_ordering': pytorch_model.topo_ordering}
        descriptors.append(model['model_descriptor'])

        mutations_applied = []
        # overall , mutations, training
        times = [0, 0, 0]

        # apply operators
        for i in range(0, n_mutations):

            time_mut_s = time.time()

            # we don't mutate the first child!
            if model_idx != 0:

                mutations_probs = np.array([1, 1, 1, 1, 1, 1])
                [model, mutation_type, params] = network_operators.MutateNetwork(model, batch,
                                                                                 mutation_probs=mutations_probs)
                mutations_applied.append(mutation_type)
                time_mut_e = time.time()

                times[1] = times[1] + (time_mut_e - time_mut_s)

                pytorch_total_params = sum(p.numel() for p in model['pytorch_model'].parameters() if p.requires_grad)

                # stop mutating once the parameter budget is exceeded
                if pytorch_total_params > max_n_params:
                    break

        # train
        time_train_s = time.time()

        # initial short training of the children
        model['pytorch_model'].fit(trainloader, epochs=n_epochs_each)

        time_train_e = time.time()
        times[2] = times[2] + (time_train_e - time_train_s)

        performance[model_idx] = model['pytorch_model'].evaluate(validloader)

        pytorch_total_params_child = sum(p.numel() for p in model['pytorch_model'].parameters() if p.requires_grad)
        torch.save(model['pytorch_model'].state_dict(), savepath + 'model_' + str(model_idx))
        with open(folder_out + "performance.txt", "a+") as f_out:
            f_out.write('child ' + str(model_idx) + ' performance ' +str(performance[model_idx])+' num params '+str(pytorch_total_params_child) +'\n')

        # the descriptor may have been replaced by the mutations
        descriptors[model_idx] = copy.deepcopy(model['model_descriptor'])

        time_overall_e = time.time()

        times[0] = times[0] + (time_overall_e - time_overall_s)

        np.savetxt(savepath + 'model_' + str(model_idx) + '_times', times)

        # one descriptor layer per line; the context manager closes the file
        # even if a write fails
        with open(savepath + 'model_' + str(model_idx) + '_model_descriptor.txt', 'w') as descriptor_file:
            for layer in model['model_descriptor']['layers']:
                layer_str = str(layer)
                descriptor_file.write(layer_str + "\n")

        # delete the model (attempt to clean the memory)
        del model['pytorch_model']
        del model
        torch.cuda.empty_cache()

    # continue SH steps
    # ascending order: the children with the highest values come last
    sorted_children = np.argsort(performance)

    n_children = len(sorted_children)
    n_epochs_train_children = n_epochs_each

    while n_children > 1:

        # pick the best half of the children
        best_children = sorted_children[(n_children // 2):]
        # increase the training budget for them
        n_epochs_train_children = n_epochs_train_children * 2

        print("\nbest_children", best_children)
        print("n_epochs_train_children", n_epochs_train_children)

        for child in best_children:
            print("child ", child)

            # load the child parameters
            pytorch_model = ConvNet(descriptors[child])
            pytorch_model.cuda()
            pytorch_model.load_state_dict(torch.load(savepath + 'model_' + str(child)), strict=False)
            model = {'pytorch_model': pytorch_model,
                     'model_descriptor': copy.deepcopy(descriptors[child]),
                     'topo_ordering': pytorch_model.topo_ordering}

            # train a child
            model['pytorch_model'].fit(trainloader, epochs=n_epochs_train_children)

            # evaluate a child
            performance[child] = model['pytorch_model'].evaluate(validloader)
            pytorch_total_params_child = sum(p.numel() for p in model['pytorch_model'].parameters() if p.requires_grad)
            with open(folder_out + "performance.txt", "a+") as f_out:
                f_out.write('child ' + str(child) + ' performance ' +str(performance[child])+' num params '+str(pytorch_total_params_child) +'\n')

            # update a child model
            torch.save(model['pytorch_model'].state_dict(), savepath + 'model_' + str(child))

            # delete the model (attempt to clean the memory)
            del model['pytorch_model']
            del model
            torch.cuda.empty_cache()

        print("\nperformance ", performance)

        # re-rank, keeping only the children trained in this round
        temp_children_array = np.argsort(performance)
        sorted_children = [t for t in temp_children_array if t in best_children]

        print("sorted_children ", sorted_children)
        n_children = len(sorted_children)

    print("it should be the winner", sorted_children[0])
    print("it should be the best performance", performance[sorted_children[0]])

    # load the best child
    the_best_child = sorted_children[0]

    pytorch_model = ConvNet(descriptors[the_best_child])
    pytorch_model.cuda()
    pytorch_model.load_state_dict(torch.load(savepath + 'model_' + str(the_best_child)), strict=False)
    model = {'pytorch_model': pytorch_model,
             'model_descriptor': copy.deepcopy(descriptors[the_best_child]),
             'topo_ordering': pytorch_model.topo_ordering}

    with open(folder_out + "performance.txt", "a+") as f_out:
        f_out.write("****************************\n")

    return model, performance[sorted_children[0]]
def SplitConnection(layer2split_id, old_model, batch, split_factor):
    """
    Split a Conv-BatchNorm-Relu block into two parallel branches.

    The existing conv / batchnorm / activation descriptor entries are
    re-labelled with fresh ids and form the first branch. A second branch
    with copied parameters is added next to it, and an 'add' merge layer sums
    both activations. Layers that consumed the old activation are rewired to
    the merge layer. The first branch is loaded with ``split_factor`` times
    the old conv and batchnorm tensors, the second branch with
    ``1 - split_factor`` times them.

    Args:
        layer2split_id: id of the conv or sep layer to be split into two
        old_model: model before mutation
        batch: first batch of the train loader
        split_factor: splitting factor
    Returns:
        The mutated model as a dict with 'pytorch_model', 'model_descriptor'
        and 'topo_ordering'.
    """
    descriptor = copy.deepcopy(old_model['model_descriptor'])
    source_net = old_model['pytorch_model']
    layers = descriptor['layers']

    # The batchnorm is the layer fed solely by the conv, the activation is
    # the layer fed solely by that batchnorm.
    bn_entry = [
        entry for entry in layers if entry['input'] == [layer2split_id]
    ][0]
    acti_entry = [
        entry for entry in layers if entry['input'] == [bn_entry['id']]
    ][0]
    consumers = [
        entry for entry in layers if acti_entry['id'] in entry['input']
    ]
    conv_entry = [
        entry for entry in layers if entry['id'] == layer2split_id
    ][0]

    # Remember the ids of the original block before they are overwritten.
    old_id_conv = layer2split_id
    old_id_bn = bn_entry['id']
    old_id_acti = acti_entry['id']

    old_bn_entry = [entry for entry in layers if entry['id'] == old_id_bn][0]

    layer_type = conv_entry['type']
    assert layer_type in ('conv',
                          'sep'), 'Error: Layer hast to be conv or sep layer.'

    # Seven consecutive ids: branch 1 (conv, bn, acti), branch 2 (conv, bn,
    # acti) and the merge layer that sums the two branches.
    base_id = utils.GetUnusedID(descriptor)
    (new_id_conv1, new_id_bn1, new_id_acti1, new_id_conv2, new_id_bn2,
     new_id_acti2, new_id_add) = (base_id + offset for offset in range(7))

    # Branch 1 re-uses the existing entries under new ids.
    conv_entry['id'] = new_id_conv1
    bn_entry['id'] = new_id_bn1
    acti_entry['id'] = new_id_acti1

    bn_entry['input'] = [new_id_conv1]
    acti_entry['input'] = [new_id_bn1]

    # Branch 2 mirrors branch 1 and reads the same input as the conv.
    second_conv = {
        'type': conv_entry['type'],
        'params': copy.deepcopy(conv_entry['params']),
        'id': new_id_conv2,
        'input': copy.deepcopy(conv_entry['input'])
    }
    second_bn = {
        'type': 'batchnorm',
        'params': copy.deepcopy(old_bn_entry['params']),
        'id': new_id_bn2,
        'input': [new_id_conv2]
    }
    second_acti = {
        'type': 'activation',
        'params': {},
        'id': new_id_acti2,
        'input': [new_id_bn2]
    }
    add_entry = {
        'type': 'merge',
        'params': {
            'mergetype': 'add'
        },
        'id': new_id_add,
        'input': [int(new_id_acti1), int(new_id_acti2)]
    }

    # Former consumers of the old activation now read the merged output.
    utils.ReplaceInput(consumers, old_id_acti, new_id_add)

    layers.extend([second_conv, second_bn, second_acti, add_entry])

    target_net = ConvNet(descriptor)
    target_net.cuda()

    target_net = utils.InheritWeights(old_model['pytorch_model'], target_net)

    target_net.forward(batch)

    def _share_out(source, first, second, keys):
        # Copy the tensors named in `keys` from `source`, then load them
        # scaled by split_factor into `first` and by (1 - split_factor) into
        # `second`.
        originals = {key: copy.deepcopy(getattr(source, key)) for key in keys}
        for target, factor in ((first, split_factor),
                               (second, 1 - split_factor)):
            target.load_state_dict({
                key: nn.Parameter((factor * tensor).cuda())
                for key, tensor in originals.items()
            })

    conv_keys = ('weight', 'bias')

    if layer_type == 'conv':
        _share_out(source_net._modules[str(old_id_conv)],
                   target_net._modules[str(new_id_conv1)],
                   target_net._modules[str(new_id_conv2)], conv_keys)

    elif layer_type == 'sep':
        # Depthwise
        _share_out(source_net._modules[str(old_id_conv)].depthwise,
                   target_net._modules[str(new_id_conv1)].depthwise,
                   target_net._modules[str(new_id_conv2)].depthwise,
                   conv_keys)

        # Pointwise
        _share_out(source_net._modules[str(old_id_conv)].pointwise,
                   target_net._modules[str(new_id_conv1)].pointwise,
                   target_net._modules[str(new_id_conv2)].pointwise,
                   conv_keys)

    # Batchnorm: affine parameters and running statistics are split alike.
    _share_out(source_net._modules[str(old_id_bn)],
               target_net._modules[str(new_id_bn1)],
               target_net._modules[str(new_id_bn2)],
               ('weight', 'bias', 'running_var', 'running_mean'))

    return {
        'pytorch_model': target_net,
        'model_descriptor': descriptor,
        'topo_ordering': target_net.topo_ordering
    }
# Optimizer, scheduler and loss settings stored in the descriptor under
# 'compile'.
opt_algo = dict(name=optim.SGD, lr=lr_vanilla, momentum=0.9,
                weight_decay=0.0005, alpha=1.0)
sch_algo = dict(name=optim.lr_scheduler.CosineAnnealingLR, T_max=5,
                eta_min=0, last_epoch=-1)
comp = dict(optimizer=opt_algo, scheduler=sch_algo,
            loss=nn.CrossEntropyLoss, metrics=['accuracy'])


# Descriptor of the vanilla network: its layer list plus the settings above.
model_descriptor = {
    'layers': [
        layer0,
        layer1, layer1_1, layer1_2,
        layer4,
        layer5, layer5_1, layer5_2,
        layer8,
        layer9, layer9_1, layer9_2,
        layer11,
    ],
    'compile': comp,
}

# Build the basic model and move it to the GPU.
mod = ConvNet(model_descriptor)
mod.cuda()

vanilla_model = {
    'pytorch_model': mod,
    'model_descriptor': model_descriptor,
    'topo_ordering': mod.topo_ordering,
}


# Initial training of the vanilla model.
mod.fit_vanilla(trainloader, epochs=20)

# Save the vanilla model weights.
torch.save(mod.state_dict(), expfolder + "vanilla_model")



def SpecialChild(n_models, n_mutations, n_epochs_total, initial_model, savepath, folder_out):
    """