def __init__(self, g, k, s, c, h_g, h_l, std, hidden_size, num_classes):
    """
    Build the recurrent attention model out of its sub-networks.

    Args
    ----
    - g: size of the square patches in the glimpses extracted
      by the retina.
    - k: number of patches to extract per glimpse.
    - s: scaling factor that controls the size of successive patches.
    - c: number of channels in each image.
    - h_g: hidden layer size of the fc layer for `phi`.
    - h_l: hidden layer size of the fc layer for `l`.
    - std: standard deviation of the Gaussian policy.
    - hidden_size: hidden size of the rnn.
    - num_classes: number of classes in the dataset.
    """
    super(RecurrentAttention, self).__init__()

    # The same width is handed to the core network (as both of its size
    # arguments) and to every head that follows it.
    state_size = hidden_size

    self.std = std

    # Sub-networks are created in this exact order on purpose: creation
    # order determines registration order on the module.
    glimpse_args = (h_g, h_l, g, k, s, c)
    self.sensor = glimpse_network(*glimpse_args)
    self.rnn = core_network(state_size, state_size)
    self.locator = location_network(state_size, 2, std)
    self.classifier = action_network(state_size, num_classes)
    self.baseliner = baseline_network(state_size, 1)
def __init__(self, g, h_g, h_l, std, hidden_size, num_classes):
    """
    Build the recurrent attention model out of its sub-networks.

    Args
    ----
    - g: size of the square patches in the glimpses extracted
      by the retina.
    - h_g: hidden layer size of the fc layer for 'what' representation
    - h_l: hidden layer size of the fc layer for 'where' representation
    - std: standard deviation of the Gaussian policy.
    - hidden_size: hidden size of the LSTM
    - num_classes: number of classes in the dataset.
    """
    super(RecurrentAttention, self).__init__()

    # The same width is handed to the core network, to every head and to
    # the context network.
    state_size = hidden_size

    self.std = std

    # NOTE(review): `k`, `s` and `c` are NOT parameters of this constructor.
    # They are looked up in the enclosing/module scope when this runs, so
    # they must be defined there or the next two lines raise NameError.
    # Confirm where they come from before relying on this class.
    self.ret = retina(g, k, s)
    glimpse_args = (h_g, h_l, g, k, s, c)
    self.sensor = glimpse_3d(*glimpse_args)

    self.rnn = core_network(state_size, state_size)
    # The location head is built with an output size of 3 here.
    self.locator = location_network(state_size, 3, std)
    self.classifier = action_network(state_size, num_classes)
    self.baseliner = baseline_network(state_size, 1)
    self.context = context_network_clin(state_size)
def main():
    """
    Smoke-test the individual model components on two sample images.

    Loads `lenna.jpg` and `cat.jpg` from `data_dir`, pushes them through a
    glimpse network, a core network and the action / location / baseline
    heads, and prints the shape of each intermediate result.

    Returns nothing; the only output is the printed shapes.
    """
    import os  # local import: only this demo needs path handling

    # load images
    # os.path.join keeps the paths valid whether or not `data_dir` ends
    # with a path separator.
    paths = [os.path.join(data_dir, name) for name in ('lenna.jpg', 'cat.jpg')]
    imgs = torch.cat([
        torch.from_numpy(img2array(path, desired_size=[512, 512], expand=True))
        for path in paths
    ])

    # Two location rows, one for each of the two images loaded above.
    loc = torch.Tensor([[-1., 1.], [-1., 1.]])

    imgs, loc = Variable(imgs), Variable(loc)

    sensor = glimpse_network(h_g=128, h_l=128, g=64, k=3, s=2, c=3)
    g_t = sensor(imgs, loc)

    rnn = core_network(input_size=256, hidden_size=256)
    # Start from an all-zero hidden state with one row per glimpse vector.
    h_t = Variable(torch.zeros(g_t.shape[0], 256))
    h_t = rnn(g_t, h_t)

    classifier = action_network(256, 10)
    a_t = classifier(h_t)

    loc_net = location_network(256, 2, 0.11)
    # The first returned value is not needed for this shape check.
    _, l_t = loc_net(h_t)

    base = baseline_network(256, 1)
    b_t = base(h_t)

    outputs = (
        ("g_t", g_t),
        ("h_t", h_t),
        ("l_t", l_t),
        ("a_t", a_t),
        ("b_t", b_t),
    )
    for label, tensor in outputs:
        print("{}: {}".format(label, tensor.shape))
def __init__(self, g, k, s, c, h_g, h_l, std, hidden_size, num_classes):
    """
    Assemble the recurrent attention model from its components.

    Args
    ----
    - g: size of the square patches in the glimpses extracted
      by the retina.
    - k: number of patches to extract per glimpse.
    - s: scaling factor that controls the size of successive patches.
    - c: number of channels in each image.
    - h_g: hidden layer size of the fc layer for `phi`.
    - h_l: hidden layer size of the fc layer for `l`.
    - std: standard deviation of the Gaussian policy.
    - hidden_size: hidden size of the rnn.
    - num_classes: number of classes in the dataset.
    """
    super(RecurrentAttention, self).__init__()
    self.std = std

    # Glimpse sensor: extracts features from x at location l_t_prev and
    # merges the image-patch information with the location information.
    sensor = glimpse_network(h_g, h_l, g, k, s, c)
    self.sensor = sensor

    # Core network: merges the current glimpse vector g_t with the hidden
    # state h_t carried over from the previous step.
    core = core_network(hidden_size, hidden_size)
    self.rnn = core

    # Location head: turns the core network's internal state `h_t` into
    # the location coordinates `l_t` for the next time step. It is fed the
    # new h_t only, not the previous location l_t_prev.
    locator = location_network(hidden_size, 2, std)
    self.locator = locator

    # Remaining heads, both built on the same hidden size.
    self.classifier = action_network(hidden_size, num_classes)
    self.baseliner = baseline_network(hidden_size, 1)
def __init__(self, g, k, s, c, h_g, h_l, std, hidden_size, num_classes,
             kernel_size, num_stacks, stack_attn_mode):
    """
    Initialize the stacked recurrent attention model and its components.

    Args
    ----
    - g: size of the square patches in the glimpses extracted
      by the retina.
    - k: number of patches to extract per glimpse.
    - s: scaling factor that controls the size of successive patches.
    - c: number of channels in each image.
    - h_g: hidden layer size of the fc layer for `phi`.
    - h_l: hidden layer size of the fc layer for `l`.
    - std: standard deviation of the Gaussian policy.
    - hidden_size: hidden size of the rnn.
    - num_classes: number of classes in the dataset.
    - kernel_size: list of int, convolutional kernel size in stacked RAM
    - num_stacks: int, number of layers in stacked RAM
    - stack_attn_mode: str, values chosen from 'separate', 'concat',
      'combine'

    Raises
    ------
    - ValueError: if `stack_attn_mode` is not one of the three supported
      values.
    """
    super(RecurrentAttention, self).__init__()
    self.std = std
    self.num_stacks = num_stacks
    self.stack_attn_mode = stack_attn_mode

    # One glimpse network and one core network per stack.
    self.sensor = nn.ModuleList([
        glimpse_network(h_g, h_l, g, k, s, c, kernel_size)
        for _ in range(num_stacks)
    ])
    self.rnn = nn.ModuleList([
        core_network(h_g + h_l, hidden_size)
        for _ in range(num_stacks)
    ])

    if stack_attn_mode == 'separate':
        # An independent location head per stack.
        self.locator = nn.ModuleList([
            location_network(hidden_size, 2, std)
            for _ in range(num_stacks)
        ])
    elif stack_attn_mode == 'concat':
        # A single head sized for all stacks' hidden states, output size 2.
        self.locator = location_network(hidden_size * num_stacks, 2, std)
    elif stack_attn_mode == 'combine':
        # A single head sized for all stacks' hidden states, with
        # 2 outputs per stack.
        self.locator = location_network(
            hidden_size * num_stacks, 2 * num_stacks, std)
    else:
        # Raising a bare string is itself a TypeError on Python 3, which
        # hid the real message; raise a proper exception instead.
        raise ValueError('Unknown stack_attn_mode [%s]' % stack_attn_mode)

    self.baseliner = nn.ModuleList([
        baseline_network(hidden_size, 1)
        for _ in range(num_stacks)
    ])
    # The classifier is sized for the hidden states of all stacks.
    self.classifier = action_network(hidden_size * num_stacks, num_classes)
def __init__(self, g, c, image_size, std, hidden_size, num_classes, config):
    """
    Build the recurrent attention model, its sub-networks and the fixed
    kernels used by the saliency computation.

    Args
    ----
    - g: size of the square patches in the glimpses extracted
      by the retina.
    - c: number of channels in each image.
    - image_size: a tuple: (H x W)
    - std: standard deviation of the Gaussian policy.
    - hidden_size: hidden size of the rnn.
    - num_classes: number of classes in the dataset.
    - config: configuration object; this constructor reads `alpha`,
      `gamma`, `kernel_size`, `batch_size` and `use_gpu` from it and also
      passes it on to several sub-networks.
    """
    super(RecurrentAttention, self).__init__()

    # Used when the locations l follow a Gaussian distribution.
    self.std = std

    # Used when the locations l follow a symmetric stable distribution.
    self.alpha = config.alpha
    self.gamma = config.gamma

    self.config = config

    # Sub-networks, created in a fixed order.
    self.context = context_network(c, config.kernel_size, hidden_size)
    self.sensor = glimpse_network(hidden_size, g, c, config)
    self.rnn = core_network(hidden_size, hidden_size, config)
    self.top_down_locator = location_network(hidden_size, 2, config)
    self.bot_up_locator = Levy_bottom_up_generator(
        config.batch_size, image_size, config)
    self.combine_location = combine_location_network(hidden_size, config)
    self.classifier = action_network(hidden_size, num_classes)
    self.baseliner = baseline_network(hidden_size, 1)

    # Tensor type for the constant kernels created below.
    if self.config.use_gpu:
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    # Derivative kernels for the saliency map: [-1, 0, 1] laid out along
    # the third axis (y) and along the fourth axis (x).
    taps = [-1, 0, 1]
    self.derivative_y = torch.tensor(taps).reshape(1, 1, 3, 1).type(dtype)
    self.derivative_x = torch.t(
        torch.tensor(taps)).reshape(1, 1, 1, 3).type(dtype)

    # Gaussian weighting of the saliency around a fixation center.
    # The paper divides by 6, but pytorch does not accept such a big
    # kernel, hence 12.
    # NOTE(review): for image_size[0] < 12 this floors to 0, and the
    # division further down becomes 0 / 0 -- confirm small images are
    # never used here.
    sigma = math.floor(image_size[0] / 12)
    self.gaussian_kernel_sigma = sigma
    gaussian_kernel_size = sigma * 2 + 1

    grid_x, grid_y = torch.meshgrid(
        torch.arange(-sigma, sigma + 1).type(dtype),
        torch.arange(-sigma, sigma + 1).type(dtype))
    squared_radius = grid_x ** 2 + grid_y ** 2
    kernel = torch.exp(-squared_radius / sigma ** 2)
    self.gaussian_kernel = kernel.reshape(
        1, 1, gaussian_kernel_size, gaussian_kernel_size)