def test_tie_breaking_off(self):
    """
    Test k-winners with ties. Tie-breaking disabled.
    """
    x = self.x2

    # Force a tie
    x[0, 5] = x[0, 1]

    expected = torch.zeros_like(x)
    expected[0, 0] = x[0, 0]
    expected[0, 1] = x[0, 1]
    expected[0, 3] = x[0, 3]
    expected[0, 5] = x[0, 5]
    expected[1, 1] = x[1, 1]
    expected[1, 3] = x[1, 3]
    expected[1, 5] = x[1, 5]

    n = 6
    kw = KWinners(n, percent_on=0.5, k_inference_factor=1.0,
                  boost_strength=1.0, boost_strength_factor=0.5,
                  duty_cycle_period=1000, break_ties=False)
    kw.duty_cycle[:] = self.duty_cycle2

    result = kw(x)
    self.assertTrue(result.eq(expected).all())
def test_k_winners_module(self):
    x = self.x2
    n = 6

    kw = KWinners(
        n,
        percent_on=0.333,
        boost_strength=1.0,
        boost_strength_factor=0.5,
        duty_cycle_period=1000,
    )
    kw.train()  # Testing with mod.training = True

    # Expect 2 winners per batch (33% of 6)
    expected = torch.zeros_like(x)
    expected[0, 0] = 1.5
    expected[0, 3] = 1.3
    expected[1, 2] = 1.2
    expected[1, 3] = 1.6

    result = kw(x)
    self.assertEqual(result.shape, expected.shape)

    num_correct = (result == expected).sum()
    self.assertEqual(num_correct, result.reshape(-1).size()[0])

    new_duty = torch.tensor([1.0, 0, 1.0, 2.0, 0, 0]) / 2.0
    diff = (kw.duty_cycle - new_duty).abs().sum()
    self.assertLessEqual(diff, 0.001)
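# The duty-cycle values asserted above follow from a moving-average update
# over per-unit winner counts. Below is a minimal sketch of that update,
# written only to reproduce the numbers this test expects; the actual
# KWinners internals may differ (the function name and signature here are
# assumptions, not the library API).
import torch


def update_duty_cycle_sketch(duty_cycle, output, learning_iterations,
                             duty_cycle_period=1000):
    """Moving average of how often each unit wins."""
    batch_size = output.shape[0]
    learning_iterations += batch_size
    period = min(duty_cycle_period, learning_iterations)
    # Decay the running average, add this batch's winner counts, renormalize.
    duty_cycle.mul_(period - batch_size)
    duty_cycle.add_(output.gt(0).sum(dim=0, dtype=torch.float))
    duty_cycle.div_(period)
    return learning_iterations


# After one batch of size 2 with winners (0, 3) in the first sample and
# (2, 3) in the second: period = 2, counts = [1, 0, 1, 2, 0, 0], so
# duty_cycle = [1, 0, 1, 2, 0, 0] / 2, matching new_duty above.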
def __init__(self, input_size, output_size, hidden_size, num_segments,
             dim_context, sparsity, kw=False, relu=False,
             dendritic_layer_class=AbsoluteMaxGatingDendriticLayer):

    # The nonlinearity can either be k-Winners or ReLU, but not both
    assert not (kw and relu)

    super().__init__()

    self.num_segments = num_segments
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.dim_context = dim_context
    self.kw = kw
    self.relu = relu

    # Forward layers & k-winners
    self.dend1 = dendritic_layer_class(
        module=nn.Linear(input_size, hidden_size),
        num_segments=num_segments,
        dim_context=dim_context,
        module_sparsity=sparsity,
        dendrite_sparsity=sparsity,
    )
    self.dend2 = dendritic_layer_class(
        module=nn.Linear(hidden_size, hidden_size),
        num_segments=num_segments,
        dim_context=dim_context,
        module_sparsity=sparsity,
        dendrite_sparsity=sparsity,
    )

    if kw:
        self.kw1 = KWinners(n=hidden_size, percent_on=0.05,
                            k_inference_factor=1.0, boost_strength=0.0,
                            boost_strength_factor=0.0)
        self.kw2 = KWinners(n=hidden_size, percent_on=0.05,
                            k_inference_factor=1.0, boost_strength=0.0,
                            boost_strength_factor=0.0)
    if relu:
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()

    # Final classifier layer
    self.classifier = SparseWeights(nn.Linear(hidden_size, output_size),
                                    sparsity=sparsity)
def __init__(self, input_size=28 * 28, n_hidden_units=1000, n_classes=10,
             is_sparse=False, sparsity=(0.75, 0.85), percent_on=0.1):
    """
    Initialize a 2-layer MLP

    :param input_size: number of input features to the MLP
    :type input_size: int

    :param n_hidden_units: number of units in each of the two hidden layers
    :type n_hidden_units: int

    :param n_classes: number of output units
    :type n_classes: int

    :param is_sparse: whether or not to initialize the sparse network instead
                      of a dense one
    :type is_sparse: bool

    :param sparsity: a 2-element list/tuple specifying the sparsity in each
                     of the hidden layers
    :type sparsity: list/tuple of float

    :param percent_on: fraction of active units in the K-Winners layer (only
                       applies to sparse networks)
    :type percent_on: float
    """
    super().__init__()

    self.is_sparse = is_sparse
    self.flatten = Flatten()
    self.n_classes = n_classes

    self.fc1 = torch.nn.Linear(input_size, n_hidden_units)
    self.fc2 = torch.nn.Linear(n_hidden_units, n_hidden_units)
    self.fc3 = torch.nn.Linear(n_hidden_units, n_classes)

    if is_sparse:
        self.fc1_sparsity, self.fc2_sparsity = sparsity
        self.percent_on = percent_on

        self.fc1 = SparseWeights(self.fc1, sparsity=self.fc1_sparsity)
        self.kw1 = KWinners(n=n_hidden_units, percent_on=percent_on,
                            boost_strength=0.0)

        self.fc2 = SparseWeights(self.fc2, sparsity=self.fc2_sparsity)
        self.kw2 = KWinners(n=n_hidden_units, percent_on=percent_on,
                            boost_strength=0.0)
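# For context: a standalone sketch of the SparseWeights + KWinners pairing
# used above, assuming both are imported from nupic.torch.modules as in the
# rest of this code. SparseWeights zeroes a fraction of the weights;
# KWinners keeps only the top percent_on units active per sample.
import torch
from nupic.torch.modules import KWinners, SparseWeights

fc = SparseWeights(torch.nn.Linear(28 * 28, 1000), sparsity=0.75)
kw = KWinners(n=1000, percent_on=0.1, boost_strength=0.0)

x = torch.rand(4, 28 * 28)
y = kw(fc(x))
assert (y != 0).sum(dim=1).max() <= 100  # at most 10% of 1000 units active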
def _create_preprocess_module(self, module_type, preprocess_output_dim,
                              kw_percent_on):
    if module_type is None:
        return None

    preprocess_module = nn.Sequential()
    linear_layer = SparseWeights(
        torch.nn.Linear(
            self.context_representation_dim + self.representation_dim,
            preprocess_output_dim,
            bias=True,
        ),
        sparsity=self.weight_sparsity,
        allow_extremes=True,
    )
    DendriticMLP._init_sparse_weights(linear_layer, 0.0)

    if module_type == "relu":
        nonlinearity = nn.ReLU()
    else:
        nonlinearity = KWinners(n=preprocess_output_dim,
                                percent_on=kw_percent_on,
                                k_inference_factor=1.0,
                                boost_strength=0.0,
                                boost_strength_factor=0.0)

    preprocess_module.add_module("linear_layer", linear_layer)
    preprocess_module.add_module("nonlinearity", nonlinearity)
    self.context_representation_dim = preprocess_output_dim
    return preprocess_module
def _kwinners(self, fout):
    return KWinners(
        n=fout,
        percent_on=self.percent_on,
        boost_strength=self.boost_strength,
        boost_strength_factor=self.boost_strength_factor,
    )
def test_permuted_model_loading(self):
    model = torch.nn.Sequential(
        KWinners(8, percent_on=0.1),
        torch.nn.Linear(8, 8),
    )
    param_map = {
        "0.weight": "1.weight",
        "0.bias": "1.bias",
        "1.boost_strength": "0.boost_strength",
        "1.duty_cycle": "0.duty_cycle",
    }
    model = load_multi_state(
        model,
        restore_linear=self.checkpoint_path,
        param_map=param_map,
    )
    model = load_multi_state(
        model,
        restore_full_model=self.checkpoint_path,
        param_map=param_map,
    )
def _create_preprocess_module(self, module_type, preprocess_output_dim,
                              kw_percent_on):
    preprocess_module = nn.Sequential()
    if module_type is None:
        return preprocess_module, self.context_dim

    linear_layer = torch.nn.Linear(
        self.context_dim, preprocess_output_dim, bias=True
    )

    if module_type == "relu":
        nonlinearity = nn.ReLU()
    elif module_type == "kw":
        nonlinearity = KWinners(
            n=preprocess_output_dim,
            percent_on=kw_percent_on,
            k_inference_factor=1.0,
            boost_strength=0.0,
            boost_strength_factor=0.0,
        )
    else:
        nonlinearity = nn.Identity()

    preprocess_module.add_module("linear_layer", linear_layer)
    preprocess_module.add_module("nonlinearity", nonlinearity)
    return preprocess_module, preprocess_output_dim
def __init__(self, input_size, hidden, sparsity, percent_on, boost_strength):
    super().__init__()
    self.sparse_linear = SparseWeights(nn.Linear(input_size, hidden),
                                       sparsity=sparsity)
    self.kw = KWinners(n=hidden, percent_on=percent_on,
                       boost_strength=boost_strength)
def __init__(self, num_classes, input_shape):
    super().__init__()
    in_features = np.prod(input_shape)
    self.flatten = torch.nn.Flatten()
    self.kwinners = KWinners(in_features, percent_on=0.1)
    self.classifier = SparseWeights(
        nn.Linear(in_features, num_classes, bias=False),
        sparsity=0.9,
    )
def __init__(self,
             cnn_out_channels=(64, 64),
             cnn_percent_on=(0.095, 0.125),
             linear_units=1000,
             linear_percent_on=0.1,
             linear_weight_sparsity=0.4,
             boost_strength=1.5,
             boost_strength_factor=0.9,
             k_inference_factor=1.5,
             duty_cycle_period=1000):
    super(GSCSparseCNN, self).__init__(OrderedDict([
        # First Sparse CNN layer
        ("cnn1", nn.Conv2d(1, cnn_out_channels[0], 5)),
        ("cnn1_batchnorm", nn.BatchNorm2d(cnn_out_channels[0], affine=False)),
        ("cnn1_maxpool", nn.MaxPool2d(2)),
        ("cnn1_kwinner", KWinners2d(
            channels=cnn_out_channels[0],
            percent_on=cnn_percent_on[0],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period)),

        # Second Sparse CNN layer
        ("cnn2", nn.Conv2d(cnn_out_channels[0], cnn_out_channels[1], 5)),
        ("cnn2_batchnorm", nn.BatchNorm2d(cnn_out_channels[1], affine=False)),
        ("cnn2_maxpool", nn.MaxPool2d(2)),
        ("cnn2_kwinner", KWinners2d(
            channels=cnn_out_channels[1],
            percent_on=cnn_percent_on[1],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period)),

        ("flatten", Flatten()),

        # Sparse Linear layer
        ("linear", SparseWeights(
            nn.Linear(25 * cnn_out_channels[1], linear_units),
            weight_sparsity=linear_weight_sparsity)),
        ("linear_bn", nn.BatchNorm1d(linear_units, affine=False)),
        ("linear_kwinner", KWinners(
            n=linear_units,
            percent_on=linear_percent_on,
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period)),

        # Classifier
        ("output", nn.Linear(linear_units, 12)),
        ("softmax", nn.LogSoftmax(dim=1)),
    ]))
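# GSCSparseCNN subclasses nn.Sequential, so the OrderedDict above is the
# entire forward pass. A hedged smoke test: the 1x32x32 input size is
# inferred from the 25 * cnn_out_channels[1] flatten (32 -> 28 -> 14 ->
# 10 -> 5 across the conv/pool stages), not stated in this snippet.
import torch

model = GSCSparseCNN()
model.eval()  # BatchNorm uses running stats; k-winners widens k by
              # k_inference_factor at inference time

log_probs = model(torch.rand(2, 1, 32, 32))
assert log_probs.shape == (2, 12)  # 12 output classes, log-probabilities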
def add_sparse_linear_layer(
    network,
    suffix,
    input_size,
    linear_n,
    dropout,
    use_batch_norm,
    weight_sparsity,
    percent_on,
    k_inference_factor,
    boost_strength,
    boost_strength_factor,
):
    """Add a sparse linear layer to the network.

    :param network: The network to add the sparse layer to
    :param suffix: Layer suffix. Used to name its components
    :param input_size: Input size
    :param linear_n: Number of units
    :param dropout: dropout value
    :param use_batch_norm: whether or not to use batch norm
    :param weight_sparsity: Pct of weights that are allowed to be non-zero
    :param percent_on: Pct of ON (non-zero) units
    :param k_inference_factor: During inference we increase percent_on by
           this factor
    :param boost_strength: boost strength (0.0 implies no boosting)
    :param boost_strength_factor: boost strength is multiplied by this factor
           after each epoch
    """
    linear = nn.Linear(input_size, linear_n)
    if 0 < weight_sparsity < 1.0:
        network.add_module(
            "linear{}".format(suffix), SparseWeights(linear, weight_sparsity)
        )
    else:
        network.add_module("linear{}".format(suffix), linear)

    if use_batch_norm:
        network.add_module("linear{}_bn".format(suffix),
                           nn.BatchNorm1d(linear_n, affine=False))

    if dropout > 0.0:
        network.add_module("linear{}_dropout".format(suffix),
                           nn.Dropout(dropout))

    if 0 < percent_on < 1.0:
        network.add_module(
            "linear{}_kwinners".format(suffix),
            KWinners(
                n=linear_n,
                percent_on=percent_on,
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
            ),
        )
    else:
        network.add_module("linear{}_relu".format(suffix), nn.ReLU())
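# Usage sketch with illustrative parameter values (not from the source),
# showing why the suffix is threaded through every module name: stacking
# two layers on one nn.Sequential would otherwise collide.
import torch.nn as nn

network = nn.Sequential()
for i, (fan_in, fan_out) in enumerate([(784, 256), (256, 128)]):
    add_sparse_linear_layer(
        network,
        suffix=str(i + 1),
        input_size=fan_in,
        linear_n=fan_out,
        dropout=0.0,
        use_batch_norm=True,
        weight_sparsity=0.4,
        percent_on=0.1,
        k_inference_factor=1.5,
        boost_strength=1.5,
        boost_strength_factor=0.9,
    )
# Modules: linear1, linear1_bn, linear1_kwinners, linear2, linear2_bn,
# linear2_kwinners.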
def __init__(self, num_classes, input_shape):
    super().__init__()
    in_features = np.prod(input_shape)
    self.dendritic_gate = DendriticAbsoluteMaxGate1d()
    self.flatten = torch.nn.Flatten()
    self.kwinners = KWinners(n=16, percent_on=0.75, k_inference_factor=1)
    self.classifier = SparseWeights(
        torch.nn.Linear(in_features, num_classes, bias=False),
        sparsity=0.5,
    )
def __init__(self,
             cnn_out_channels=(32, 64),
             cnn_percent_on=(0.087, 0.293),
             linear_units=700,
             linear_percent_on=0.143,
             linear_weight_sparsity=0.3,
             boost_strength=1.5,
             boost_strength_factor=0.85,
             k_inference_factor=1.5,
             duty_cycle_period=1000):
    super(MNISTSparseCNN, self).__init__(OrderedDict([
        # First Sparse CNN layer
        ("cnn1", nn.Conv2d(1, cnn_out_channels[0], 5)),
        ("cnn1_maxpool", nn.MaxPool2d(2)),
        ("cnn1_kwinner", KWinners2d(
            channels=cnn_out_channels[0],
            percent_on=cnn_percent_on[0],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period)),

        # Second Sparse CNN layer
        ("cnn2", nn.Conv2d(cnn_out_channels[0], cnn_out_channels[1], 5)),
        ("cnn2_maxpool", nn.MaxPool2d(2)),
        ("cnn2_kwinner", KWinners2d(
            channels=cnn_out_channels[1],
            percent_on=cnn_percent_on[1],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period)),

        ("flatten", Flatten()),

        # Sparse Linear layer
        ("linear", SparseWeights(
            nn.Linear(16 * cnn_out_channels[1], linear_units),
            weight_sparsity=linear_weight_sparsity)),
        ("linear_kwinner", KWinners(
            n=linear_units,
            percent_on=linear_percent_on,
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period)),

        # Classifier
        ("output", nn.Linear(linear_units, 10)),
        ("softmax", nn.LogSoftmax(dim=1)),
    ]))
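# The MNIST variant follows the same pattern for 28x28 inputs
# (28 -> 24 -> 12 -> 8 -> 4, hence the 16 * cnn_out_channels[1] flattened
# size). An analogous check, under the same assumptions as the GSC sketch:
import torch

model = MNISTSparseCNN()
model.eval()
assert model(torch.rand(2, 1, 28, 28)).shape == (2, 10)  # 10 MNIST classes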
def test_kwinners_grad(self):
    n = 3
    x = torch.tensor([[0, 2, 1], [14, 13, 12]], dtype=torch.float,
                     requires_grad=True)
    grad = torch.tensor([[5, 6, 7], [45, 46, 47]], dtype=torch.float)
    expected = torch.tensor([[0, 6, 0], [45, 0, 0]], dtype=torch.float)

    for break_ties in [True, False]:
        with self.subTest(break_ties=break_ties):
            kw = KWinners(n, percent_on=(1 / 3), k_inference_factor=1.0,
                          boost_strength=0.0, break_ties=break_ties)
            kw(x).backward(grad)
            torch.testing.assert_allclose(x.grad, expected)
            x.grad.zero_()
def __init__(
    self,
    input_size,
    output_size,
    kw_percent_on=0.05,
    boost_strength=0.0,
    weight_sparsity=0.95,
    duty_cycle_period=1000,
):
    super().__init__()
    self.linear = SparseWeights(nn.Linear(input_size, output_size),
                                sparsity=weight_sparsity,
                                allow_extremes=True)
    self.kw = KWinners(n=output_size, percent_on=kw_percent_on,
                       boost_strength=boost_strength,
                       duty_cycle_period=duty_cycle_period)
def test_kwinners_relu(self):
    n = 4
    x = torch.tensor([[-5, -2, -1, 2],
                      [-2, -1, 1, 2],
                      [-4, -3, -2, -1]], dtype=torch.float)
    expected = torch.tensor([[0, 0, 0, 2],
                             [0, 0, 1, 2],
                             [0, 0, 0, 0]], dtype=torch.float)

    for break_ties in [True, False]:
        with self.subTest(break_ties=break_ties):
            kw = KWinners(n, percent_on=0.5, k_inference_factor=1.0,
                          boost_strength=1.0, break_ties=break_ties,
                          relu=True)
            result = kw(x)
            self.assertTrue(result.eq(expected).all())
def test_k_winners_module_two(self):
    """
    Test a series of calls on the layer in training mode.
    """
    # Set up test input and module.
    x = self.x2
    n = 6

    for break_ties in [True, False]:
        with self.subTest(break_ties=break_ties):
            expected = torch.zeros_like(x)
            expected[0, 0] = x[0, 0]
            expected[0, 5] = x[0, 5]
            expected[1, 2] = x[1, 2]
            expected[1, 3] = x[1, 3]

            kw = KWinners(
                n,
                percent_on=0.333,
                k_inference_factor=1.5,
                boost_strength=1.0,
                boost_strength_factor=0.5,
                duty_cycle_period=1000,
                break_ties=break_ties,
            )
            kw.train(mode=True)

            # Run several forward passes so boosting and the duty cycle
            # settle on the expected winners.
            for _ in range(7):
                result = kw(x)
            self.assertTrue(result.eq(expected).all())

            # Test with mod.training = False.
            kw.train(mode=False)
            result = kw(x)
            expected = torch.zeros_like(x)
            expected[0, 0] = x[0, 0]
            expected[0, 1] = x[0, 1]
            expected[0, 5] = x[0, 5]
            expected[1, 2] = x[1, 2]
            expected[1, 3] = x[1, 3]
            expected[1, 4] = x[1, 4]
            self.assertTrue(result.eq(expected).all())
def add_sparse_dendrite_layer(
    network,
    suffix,
    in_dim,
    out_dim,
    dendrites_per_neuron,
    use_batch_norm=False,
    weight_sparsity=0.2,
    percent_on=0.1,
    k_inference_factor=1,
    boost_strength=1.5,
    boost_strength_factor=0.9,
    duty_cycle_period=1000,
):
    dendrite_layer = DendriteLayer(
        in_dim=in_dim,
        out_dim=out_dim,
        dendrites_per_neuron=dendrites_per_neuron,
        weight_sparsity=weight_sparsity,
    )
    network.add_module("dendrites{}".format(suffix), dendrite_layer)

    if use_batch_norm:
        network.add_module("dendrites{}_bn".format(suffix),
                           nn.BatchNorm1d(out_dim, affine=False))

    network.add_module(
        "linear{}_kwinners".format(suffix),
        KWinners(
            n=out_dim,
            percent_on=percent_on,
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period,
        ),
    )
def setUp(self):
    set_random_seed(20)
    self.model = torch.nn.Sequential(
        torch.nn.Linear(8, 8),
        KWinners(8, percent_on=0.1),
    )

    # Create temporary results directory.
    self.tempdir = tempfile.TemporaryDirectory()
    self.results_dir = Path(self.tempdir.name) / Path("results")
    self.results_dir.mkdir()

    # Save model state.
    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.model.state_dict(),
                             compresslevel=-1)
        state["model"] = buffer.getvalue()

    self.checkpoint_path = self.results_dir / Path("mymodel")
    with open(self.checkpoint_path, "wb") as f:
        pickle.dump(state, f)
def __init__(
    self,
    input_size,
    output_size,
    hidden_sizes,
    num_segments,
    dim_context,
    kw,
    kw_percent_on=0.05,
    context_percent_on=1.0,
    dendrite_weight_sparsity=0.95,
    weight_sparsity=0.95,
    weight_init="modified",
    dendrite_init="modified",
    freeze_dendrites=False,
    output_nonlinearity=None,
    dendritic_layer_class=AbsoluteMaxGatingDendriticLayer,
):
    # Forward & dendritic weight initialization must be either "kaiming" or
    # "modified"
    assert weight_init in ("kaiming", "modified")
    assert dendrite_init in ("kaiming", "modified")
    assert kw_percent_on is None or (0.0 <= kw_percent_on < 1.0)
    assert context_percent_on >= 0.0

    if kw_percent_on == 0.0:
        kw = False

    super().__init__()

    if num_segments == 1:
        # Use the optimized 1-segment class
        dendritic_layer_class = OneSegmentDendriticLayer

    self.num_segments = num_segments
    self.input_size = input_size
    self.hidden_sizes = hidden_sizes
    self.output_size = output_size
    self.dim_context = dim_context
    self.kw = kw
    self.kw_percent_on = kw_percent_on
    self.weight_sparsity = weight_sparsity
    self.dendrite_weight_sparsity = dendrite_weight_sparsity
    self.output_nonlinearity = output_nonlinearity
    self.hardcode_dendrites = (dendrite_init == "hardcoded")

    self._layers = nn.ModuleList()
    self._activations = nn.ModuleList()

    if self.hardcode_dendrites:
        dendrite_sparsity = 0.0
    else:
        dendrite_sparsity = self.dendrite_weight_sparsity

    for i in range(len(self.hidden_sizes)):
        curr_dend = dendritic_layer_class(
            module=nn.Linear(input_size, self.hidden_sizes[i], bias=True),
            num_segments=num_segments,
            dim_context=dim_context,
            module_sparsity=self.weight_sparsity,
            dendrite_sparsity=dendrite_sparsity,
        )

        if weight_init == "modified":
            # Scale weights to be sampled from the new initialization
            # U(-h, h) where
            # h = sqrt(1 / (weight_density * previous_layer_percent_on))
            if i == 0:
                # First hidden layer can't have kw input
                self._init_sparse_weights(curr_dend, 0.0)
            else:
                self._init_sparse_weights(
                    curr_dend, 1 - kw_percent_on if kw else 0.0
                )

        if dendrite_init == "modified":
            self._init_sparse_dendrites(curr_dend, 1 - context_percent_on)

        if freeze_dendrites:
            # Dendritic weights will not be updated during backward pass
            for name, param in curr_dend.named_parameters():
                if "segments" in name:
                    param.requires_grad = False

        if self.kw:
            curr_activation = KWinners(n=hidden_sizes[i],
                                       percent_on=kw_percent_on,
                                       k_inference_factor=1.0,
                                       boost_strength=0.0,
                                       boost_strength_factor=0.0)
        else:
            curr_activation = nn.ReLU()

        self._layers.append(curr_dend)
        self._activations.append(curr_activation)

        input_size = self.hidden_sizes[i]

    self._single_output_head = not isinstance(output_size, Iterable)
    if self._single_output_head:
        output_size = (output_size,)

    self._output_layers = nn.ModuleList()
    for out_size in output_size:
        output_layer = nn.Sequential()
        output_linear = SparseWeights(module=nn.Linear(input_size, out_size),
                                      sparsity=weight_sparsity,
                                      allow_extremes=True)
        if weight_init == "modified":
            self._init_sparse_weights(
                output_linear, 1 - kw_percent_on if kw else 0.0
            )
        output_layer.add_module("output_linear", output_linear)
        if self.output_nonlinearity is not None:
            output_layer.add_module("non_linearity", output_nonlinearity)
        self._output_layers.append(output_layer)
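# A hedged sketch of the "modified" initialization referenced in the comment
# above. The helper name and the fan-in factor are assumptions, as is the
# SparseWeights wrapper exposing .sparsity and .module; only U(-h, h) with h
# driven by the weight density and the previous layer's percent-on comes
# from the comment.
import math
import torch.nn as nn


def init_sparse_weights_sketch(sparse_module, input_sparsity):
    """Re-sample weights from U(-h, h), widening h as the effective number
    of non-zero inputs per unit shrinks."""
    input_density = 1.0 - input_sparsity  # previous layer's percent-on
    weight_density = 1.0 - sparse_module.sparsity
    _, fan_in = sparse_module.module.weight.shape
    h = math.sqrt(1.0 / (input_density * weight_density * fan_in))
    nn.init.uniform_(sparse_module.module.weight, -h, h)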
def _kwinners(self, num_units):
    return KWinners(n=num_units,
                    percent_on=0.25,
                    boost_strength=1.4,
                    boost_strength_factor=0.7)
def __init__(self, sparsify=False, percent_on=0.3, k_inference_factor=1.5,
             boost_strength=1.0, boost_strength_factor=0.9,
             duty_cycle_period=1000, num_classes=10, hidden_units=2048,
             hidden_layers=1, dropout=0.5, weight_sparsity=0.5,
             input_size=28 * 28, stats=False):
    """
    Constructor for the SimpleMLP object.

    Args:
        num_classes (int): total number of classes of the benchmark, i.e.
            maximum output neurons of the model.
        sparsify (bool): whether to introduce the KWinners and SparseWeights
            layers in the model.
        percent_on (float): percentage of active units in the fc layers.
        k_inference_factor (float): boosting parameter. Check the official
            KWinners docs for further details.
        boost_strength (float): boosting parameter.
        boost_strength_factor (float): boosting parameter.
        hidden_units (int): number of units for each hidden layer.
        hidden_layers (int): number of hidden layers.
        dropout (float): dropout probability for each dropout layer.
        weight_sparsity (float): percentage of active weights for each fc
            layer.
        input_size (int): input size (assumed to be a linearized input).
        stats (bool): whether to record sparsity statistics.
    """
    super(SimpleMLP, self).__init__()

    self.active_perc_list = []
    self.on_idxs = [0] * hidden_units
    self.hidden_units = hidden_units
    self.num_classes = num_classes
    self.stats = stats

    ft_modules = []
    if sparsify:
        for i in range(hidden_layers):
            if i == 0:
                ft_modules.append(
                    SparseWeights(nn.Linear(input_size, hidden_units),
                                  weight_sparsity=weight_sparsity))
            else:
                ft_modules.append(
                    SparseWeights(nn.Linear(hidden_units, hidden_units),
                                  weight_sparsity=weight_sparsity))
            ft_modules.append(
                KWinners(hidden_units, percent_on, k_inference_factor,
                         boost_strength, boost_strength_factor,
                         duty_cycle_period))
            ft_modules.append(nn.Dropout(dropout))
    else:
        for i in range(hidden_layers):
            if i == 0:
                ft_modules.append(nn.Linear(input_size, hidden_units))
            else:
                ft_modules.append(nn.Linear(hidden_units, hidden_units))
            ft_modules.append(nn.ReLU(inplace=True))
            ft_modules.append(nn.Dropout(dropout))

    self.features = nn.Sequential(*ft_modules)
    self.classifier = nn.Linear(hidden_units, num_classes)
def __init__(
    self,
    input_size,
    context_size,
    output_size,
    hidden_sizes,
    layers_modulated,
    num_segments,
    kw_percent_on,
    context_percent_on,
    weight_sparsity,
    weight_init,
    dendrite_weight_sparsity,
    dendrite_init,
    dendritic_layer_class,
    output_nonlinearity,
    freeze_dendrites=False,
):
    super().__init__()

    self.input_size = input_size
    self.context_size = context_size
    self.output_size = output_size
    self.hidden_sizes = hidden_sizes
    self.layers_modulated = layers_modulated
    self.num_segments = num_segments
    self.kw_percent_on = kw_percent_on
    self.context_percent_on = context_percent_on
    self.weight_sparsity = weight_sparsity
    self.weight_init = weight_init
    self.dendrite_weight_sparsity = dendrite_weight_sparsity
    self.dendrite_init = dendrite_init
    self.output_nonlinearity = output_nonlinearity

    self.layers = nn.ModuleList()
    for i in range(len(self.hidden_sizes)):
        block_name = ""
        if i not in self.layers_modulated:
            linear = FFLayer(
                module=nn.Linear(input_size, self.hidden_sizes[i], bias=True),
                module_sparsity=self.weight_sparsity,
            )
            block_name = "ff"
        else:
            linear = dendritic_layer_class(
                module=nn.Linear(input_size, self.hidden_sizes[i], bias=True),
                num_segments=self.num_segments,
                dim_context=self.context_size,
                module_sparsity=self.weight_sparsity,
                dendrite_sparsity=self.dendrite_weight_sparsity,
            )
            block_name = "dendrite"

            if self.dendrite_init == "modified":
                self._init_sparse_dendrites(linear,
                                            1 - self.context_percent_on)

            if freeze_dendrites:
                # Dendritic weights will not be updated during backward pass
                for name, param in linear.named_parameters():
                    if "segments" in name:
                        param.requires_grad = False

        if self.weight_init == "modified":
            # Scale weights to be sampled from the new initialization
            # U(-h, h) where
            # h = sqrt(1 / (weight_density * previous_layer_percent_on))
            # First hidden layer can't have kw input
            if i == 0:
                self._init_sparse_weights(linear, 0.0)
            else:
                self._init_sparse_weights(
                    linear,
                    1 - self.kw_percent_on if self.kw_percent_on else 0.0
                )

        if self.kw_percent_on:
            activation = KWinners(n=hidden_sizes[i],
                                  percent_on=kw_percent_on,
                                  k_inference_factor=1.0,
                                  boost_strength=0.0,
                                  boost_strength_factor=0.0)
        else:
            activation = nn.ReLU()

        block = SequentialBlock()
        block.add_module(block_name, SequentialBlock(linear, activation))
        self.layers.append(block)

        input_size = self.hidden_sizes[i]

    if not isinstance(output_size, Iterable):
        output_size = (output_size,)

    self._output_layers = nn.ModuleList()
    for out_size in output_size:
        output_layer = nn.Sequential()
        output_linear = SparseWeights(module=nn.Linear(input_size, out_size),
                                      sparsity=self.weight_sparsity,
                                      allow_extremes=True)
        if self.weight_init == "modified":
            self._init_sparse_weights(
                output_linear,
                1 - self.kw_percent_on if self.kw_percent_on else 0.0
            )
        output_layer.add_module("output_linear", output_linear)
        if self.output_nonlinearity is not None:
            output_layer.add_module("non_linearity", self.output_nonlinearity)
        self._output_layers.append(output_layer)
def test_k_winners_module_one(self):
    # Set up test input and module.
    x = self.x2
    n = 6

    for break_ties in [True, False]:
        with self.subTest(break_ties=break_ties):
            kw = KWinners(
                n,
                percent_on=0.333,
                k_inference_factor=1.5,
                boost_strength=1.0,
                boost_strength_factor=0.5,
                duty_cycle_period=1000,
                break_ties=break_ties,
            )

            # Test with mod.training = False.
            kw.train(mode=False)

            # Expect 3 winners per batch (1.5 * 33% of 6 is 1/2 of 6)
            expected = torch.zeros_like(x)
            expected[0, 0] = x[0, 0]
            expected[0, 2] = x[0, 2]
            expected[0, 3] = x[0, 3]
            expected[1, 0] = x[1, 0]
            expected[1, 2] = x[1, 2]
            expected[1, 3] = x[1, 3]

            result = kw(x)
            self.assertEqual(result.shape, expected.shape)
            self.assertTrue(result.eq(expected).all())

            # Run the forward pass again while still not in training mode.
            # Should give the same result as the duty cycles are not updated.
            result = kw(x)
            self.assertEqual(result.shape, expected.shape)
            self.assertTrue(result.eq(expected).all())

            # Test with mod.training = True
            kw.train(mode=True)

            # Expect 2 winners per batch (33% of 6)
            expected = torch.zeros_like(x)
            expected[0, 0] = x[0, 0]
            expected[0, 3] = x[0, 3]
            expected[1, 2] = x[1, 2]
            expected[1, 3] = x[1, 3]

            result = kw(x)
            self.assertEqual(result.shape, expected.shape)
            self.assertTrue(result.eq(expected).all())

            # Test values of the updated duty cycle.
            new_duty = torch.tensor([1.0, 0, 1.0, 2.0, 0, 0]) / 2.0
            self.assertTrue(kw.duty_cycle.eq(new_duty).all())

            # Test forward with the updated duty cycle.
            result = kw(x)
            expected = torch.zeros_like(x)
            expected[0, 1] = x[0, 1]
            expected[0, 5] = x[0, 5]
            expected[1, 1] = x[1, 1]
            expected[1, 5] = x[1, 5]
            self.assertEqual(result.shape, expected.shape)
            self.assertTrue(result.eq(expected).all())
def _setup(self, config):
    # Get trial parameters
    seed = config["seed"]
    datadir = config["datadir"]
    batch_size = config["batch_size"]
    test_batch_size = config["test_batch_size"]
    first_epoch_batch_size = config["first_epoch_batch_size"]
    in_channels, h, w = config["c1_input_shape"]
    learning_rate = config["learning_rate"]
    momentum = config["momentum"]
    weight_sparsity = config["weight_sparsity"]
    boost_strength = config["boost_strength"]
    boost_strength_factor = config["boost_strength_factor"]
    n = config["n"]
    percent_on = config["percent_on"]
    cnn_percent_on = config["cnn_percent_on"]
    k_inference_factor = config["k_inference_factor"]
    kernel_size = config["kernel_size"]
    out_channels = config["out_channels"]
    output_size = config["output_size"]
    cnn_output_len = out_channels * ((w - kernel_size + 1) // 2) ** 2

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        self.device = torch.device("cuda")
        torch.cuda.manual_seed(seed)
    else:
        self.device = torch.device("cpu")

    xforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train_dataset = datasets.MNIST(datadir, train=True, transform=xforms)
    test_dataset = datasets.MNIST(datadir, train=False, transform=xforms)

    self.train_loader = torch.utils.data.DataLoader(train_dataset,
                                                    batch_size=batch_size,
                                                    shuffle=True)
    self.test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=test_batch_size, shuffle=True)
    self.first_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=first_epoch_batch_size, shuffle=True)

    # Create simple sparse model
    self.model = nn.Sequential()

    # CNN layer
    self.model.add_module(
        "cnn",
        nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
        ),
    )
    if cnn_percent_on < 1.0:
        self.model.add_module(
            "kwinners_cnn",
            KWinners2d(
                percent_on=cnn_percent_on,
                channels=out_channels,
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
            ),
        )
    else:
        self.model.add_module("ReLU_cnn", nn.ReLU())
    self.model.add_module("maxpool", nn.MaxPool2d(kernel_size=2))

    # Flatten max pool output before passing to the linear layer
    self.model.add_module("flatten", Flatten())

    # Linear layer
    linear = nn.Linear(cnn_output_len, n)
    if weight_sparsity < 1.0:
        self.model.add_module("sparse_linear",
                              SparseWeights(linear, weight_sparsity))
    else:
        self.model.add_module("linear", linear)

    if percent_on < 1.0:
        self.model.add_module(
            "kwinners_linear",
            KWinners(
                n=n,
                percent_on=percent_on,
                k_inference_factor=k_inference_factor,
                boost_strength=boost_strength,
                boost_strength_factor=boost_strength_factor,
            ),
        )
    else:
        self.model.add_module("Linear_ReLU", nn.ReLU())

    # Output layer
    self.model.add_module("fc", nn.Linear(n, output_size))
    self.model.add_module("softmax", nn.LogSoftmax(dim=1))

    self.model.to(self.device)

    self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate,
                               momentum=momentum)
def __init__(self, config=None):
    super().__init__()

    defaults = dict(
        device="cpu",
        input_size=784,
        num_classes=10,
        hidden_sizes=[100, 100, 100],
        percent_on_k_winner=[1.0, 1.0, 1.0],
        boost_strength=[1.4, 1.4, 1.4],
        boost_strength_factor=[0.7, 0.7, 0.7],
        batch_norm=False,
        dropout=False,
        bias=True,
        k_inference_factor=1.0,
    )
    assert (
        config is None or "use_kwinners" not in config
    ), "use_kwinners is deprecated"
    defaults.update(config or {})
    self.__dict__.update(defaults)
    self.device = torch.device(self.device)

    # Decide which activation function to use
    self.activation_funcs = []
    for layer, hidden_size in enumerate(self.hidden_sizes):
        if self.percent_on_k_winner[layer] < 0.5:
            self.activation_funcs.append(
                KWinners(
                    hidden_size,
                    percent_on=self.percent_on_k_winner[layer],
                    boost_strength=self.boost_strength[layer],
                    boost_strength_factor=self.boost_strength_factor[layer],
                    k_inference_factor=self.k_inference_factor,
                )
            )
        else:
            self.activation_funcs.append(nn.ReLU())

    # Construct layers.
    kwargs = dict(bias=self.bias, batch_norm=self.batch_norm,
                  dropout=self.dropout)

    # Flatten image.
    layers = [nn.Flatten()]

    # Add the first layer.
    layers.append(
        DSLinearBlock(
            self.input_size,
            self.hidden_sizes[0],
            activation_func=self.activation_funcs[0],
            config=config,
            **kwargs,
        )
    )

    # Add hidden layers.
    for i in range(1, len(self.hidden_sizes)):
        layers.append(
            DSLinearBlock(
                self.hidden_sizes[i - 1],
                self.hidden_sizes[i],
                activation_func=self.activation_funcs[i],
                config=config,
                **kwargs,
            )
        )

    # Add the last layer.
    layers.append(
        DSLinearBlock(
            self.hidden_sizes[-1], self.num_classes, bias=self.bias,
            config=config
        )
    )

    # Create the classifier.
    self.dynamic_sparse_modules = [layer[0] for layer in layers[1:]]
    self.classifier = nn.Sequential(*layers)

    # Initialize attr to decide whether to update coactivations during
    # learning.
    self._track_coactivations = False  # Off by default.
def __init__(self,
             cnn_out_channels=(64, 64),
             cnn_percent_on=(0.095, 0.125),
             cnn_weight_sparsity=(0.5, 0.2),
             linear_units=1000,
             linear_percent_on=0.1,
             linear_weight_sparsity=0.1,
             boost_strength=1.5,
             boost_strength_factor=0.9,
             k_inference_factor=1.0,
             duty_cycle_period=1000,
             kwinner_local=False):
    super(GSCSparseCNN, self).__init__()
    # input_shape = (1, 32, 32)

    # First Sparse CNN layer
    if cnn_weight_sparsity[0] < 1.0:
        self.add_module(
            "cnn1",
            SparseWeights2d(nn.Conv2d(1, cnn_out_channels[0], 5),
                            weight_sparsity=cnn_weight_sparsity[0]))
    else:
        self.add_module("cnn1", nn.Conv2d(1, cnn_out_channels[0], 5))
    self.add_module("cnn1_batchnorm",
                    nn.BatchNorm2d(cnn_out_channels[0], affine=False))
    self.add_module(
        "cnn1_kwinner",
        KWinners2d(
            channels=cnn_out_channels[0],
            percent_on=cnn_percent_on[0],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period,
            local=kwinner_local,
        ))
    self.add_module("cnn1_maxpool", nn.MaxPool2d(2))

    # Second Sparse CNN layer
    if cnn_weight_sparsity[1] < 1.0:
        self.add_module(
            "cnn2",
            SparseWeights2d(nn.Conv2d(cnn_out_channels[0],
                                      cnn_out_channels[1], 5),
                            weight_sparsity=cnn_weight_sparsity[1]))
    else:
        self.add_module(
            "cnn2", nn.Conv2d(cnn_out_channels[0], cnn_out_channels[1], 5))
    self.add_module("cnn2_batchnorm",
                    nn.BatchNorm2d(cnn_out_channels[1], affine=False))
    self.add_module(
        "cnn2_kwinner",
        KWinners2d(
            channels=cnn_out_channels[1],
            percent_on=cnn_percent_on[1],
            k_inference_factor=k_inference_factor,
            boost_strength=boost_strength,
            boost_strength_factor=boost_strength_factor,
            duty_cycle_period=duty_cycle_period,
            local=kwinner_local,
        ))
    self.add_module("cnn2_maxpool", nn.MaxPool2d(2))

    self.add_module("flatten", Flatten())

    # Sparse Linear layer
    self.add_module(
        "linear",
        SparseWeights(nn.Linear(25 * cnn_out_channels[1], linear_units),
                      weight_sparsity=linear_weight_sparsity))
    self.add_module("linear_bn", nn.BatchNorm1d(linear_units, affine=False))
    self.add_module(
        "linear_kwinner",
        KWinners(n=linear_units,
                 percent_on=linear_percent_on,
                 k_inference_factor=k_inference_factor,
                 boost_strength=boost_strength,
                 boost_strength_factor=boost_strength_factor,
                 duty_cycle_period=duty_cycle_period))

    # Classifier
    self.add_module("output", nn.Linear(linear_units, 12))
    self.add_module("softmax", nn.LogSoftmax(dim=1))
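# The kwinner_local flag changes how KWinners2d selects winners. A hedged
# illustration, assuming the semantics suggested by the name: local=False
# competes across the whole channels x H x W volume, local=True competes
# across channels independently at each spatial position.
import torch
from nupic.torch.modules import KWinners2d

x = torch.rand(1, 8, 4, 4)  # 8 channels on a 4x4 map: 128 units

global_kw = KWinners2d(channels=8, percent_on=0.25, boost_strength=0.0,
                       local=False)
local_kw = KWinners2d(channels=8, percent_on=0.25, boost_strength=0.0,
                      local=True)

# Global: top 25% of all 128 units; winners may cluster in a few positions.
assert (global_kw(x) != 0).sum() == 32
# Local: 2 of 8 channels win at each of the 16 positions, evenly spread.
assert ((local_kw(x) != 0).sum(dim=1) == 2).all()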
def _create_vgg_model(self):
    """
    block_sizes = [1, 1, 1] - number of CNN layers in each block
    cnn_out_channels = [c1, c2, c3] - # out_channels in each layer of this
        block
    cnn_kernel_size = [k1, k2, k3] - kernel_size in each layer of this block
    cnn_weight_sparsity = [w1, w2, w3] - weight sparsity of each layer of
        this block
    cnn_percent_on = [p1, p2, p3] - percent_on in each layer of this block
    """
    # Here we require exactly 3 blocks
    # assert(len(self.block_sizes) == 3)

    # Create simple CNN model, with options for sparsity
    self.model = nn.Sequential()

    in_channels = 3
    output_size = 32 * 32
    output_units = output_size * in_channels
    for ly, block_size in enumerate(self.block_sizes):
        for b in range(block_size):
            self._add_cnn_layer(
                index_str=str(ly) + "_" + str(b),
                in_channels=in_channels,
                out_channels=self.cnn_out_channels[ly],
                kernel_size=self.cnn_kernel_sizes[ly],
                percent_on=self.cnn_percent_on[ly],
                weight_sparsity=self.cnn_weight_sparsity[ly],
                add_pooling=b == block_size - 1,
            )
            in_channels = self.cnn_out_channels[ly]
        output_size = int(output_size / 4)
        output_units = output_size * in_channels

    # Flatten CNN output before passing to linear layer
    self.model.add_module("flatten", Flatten())

    # Linear layers
    input_size = output_units
    for ly, linear_n in enumerate(self.linear_n):
        linear = nn.Linear(input_size, linear_n)
        if self.linear_weight_sparsity[ly] < 1.0:
            self.model.add_module(
                "linear_" + str(ly),
                SparseWeights(linear, self.linear_weight_sparsity[ly]),
            )
        else:
            self.model.add_module("linear_" + str(ly), linear)

        if self.linear_percent_on[ly] < 1.0:
            self.model.add_module(
                "kwinners_linear_" + str(ly),
                KWinners(
                    n=linear_n,
                    percent_on=self.linear_percent_on[ly],
                    k_inference_factor=self.k_inference_factor,
                    boost_strength=self.boost_strength,
                    boost_strength_factor=self.boost_strength_factor,
                ),
            )
        else:
            self.model.add_module("Linear_ReLU_" + str(ly), nn.ReLU())

        input_size = self.linear_n[ly]

    # Output layer
    self.model.add_module("output", nn.Linear(input_size, self.output_size))

    print(self.model)
    self.model.to(self.device)
    self._initialize_weights()
def __init__(self, config=None):
    super().__init__()

    defaults = dict(
        device="cpu",
        input_size=1024,
        num_classes=12,
        boost_strength=[1.5, 1.5, 1.5],
        boost_strength_factor=[0.9, 0.9, 0.9],
        duty_cycle_period=1000,
        k_inference_factor=1.5,
        percent_on_k_winner=[0.095, 0.125, 0.1],
        hidden_neurons_conv=[64, 64],
        hidden_neurons_fc=1000,
        batch_norm=True,
        dropout=False,
        bias=True,
    )
    defaults.update(config or {})
    self.__dict__.update(defaults)
    self.device = torch.device(self.device)

    kwargs = dict(bias=self.bias, batch_norm=self.batch_norm,
                  dropout=self.dropout)

    # Decide which activation function to use for the conv layers
    self.activation_funcs = []
    for layer, hidden_size in enumerate(self.hidden_neurons_conv):
        if self.percent_on_k_winner[layer] < 0.5:
            self.activation_funcs.append(
                KWinners2d(
                    hidden_size,
                    percent_on=self.percent_on_k_winner[layer],
                    boost_strength=self.boost_strength[layer],
                    boost_strength_factor=self.boost_strength_factor[layer],
                    k_inference_factor=self.k_inference_factor,
                )
            )
        else:
            self.activation_funcs.append(nn.ReLU())

    # Decide which activation to use for the linear layer
    if self.percent_on_k_winner[-1] < 0.5:
        linear_activation = KWinners(
            self.hidden_neurons_fc,
            percent_on=self.percent_on_k_winner[-1],
            boost_strength=self.boost_strength[-1],
            boost_strength_factor=self.boost_strength_factor[-1],
            k_inference_factor=self.k_inference_factor,
        )
    else:
        linear_activation = nn.ReLU()

    # Convolutional layers
    conv_layers = [
        # 28x28 -> 14x14
        *self._conv_block(1, self.hidden_neurons_conv[0],
                          self.activation_funcs[0]),
        # 10x10 -> 5x5
        *self._conv_block(
            self.hidden_neurons_conv[0],
            self.hidden_neurons_conv[1],
            self.activation_funcs[1],
        ),
        Flatten(),
    ]
    linear_layers = [
        DSLinearBlock(
            self.hidden_neurons_conv[1] * 25,
            self.hidden_neurons_fc,
            activation_func=linear_activation,
            batch_norm_affine=False,
            config=config,
            **kwargs,
        ),
        DSLinearBlock(self.hidden_neurons_fc, self.num_classes,
                      config=config),
    ]

    self.features = nn.Sequential(*conv_layers)
    self.classifier = nn.Sequential(*linear_layers)