Example #1
    def __init__(self, opts, img_fc_dim, img_fc_use_batchnorm, img_dropout, img_feat_input_dim,
                 rnn_hidden_size, rnn_dropout, max_len, fc_bias=True, max_navigable=16):
        super(Configuring, self).__init__()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size
        self.max_len = max_len

        proj_navigable_kwargs = {
            'input_dim': img_feat_input_dim,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias,
            'relu': opts.mlp_relu
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)

        self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size, rnn_hidden_size)

        self.lang_position = PositionalEncoding(rnn_hidden_size, dropout=0.1, max_len=max_len)

        self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])

        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

        self.r_linear = nn.Linear(rnn_hidden_size + 128, 2)
    
        self.sm = nn.Softmax(dim=1)

        self.num_predefined_action = 1

        self.state_attention = StateAttention()

        self.config_fc = nn.Linear(768, 512, bias=False)

        if opts.monitor_sigmoid:
            self.critic = nn.Sequential(
                #nn.Linear(max_len + rnn_hidden_size, 1),
                nn.Linear(10 + rnn_hidden_size, 1),
                nn.Sigmoid()
            )
        else:
            self.critic = nn.Sequential(
               # nn.Linear(max_len + rnn_hidden_size, 1),
                nn.Linear(10 + rnn_hidden_size, 1),
                nn.Tanh()
            )
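The build_mlp helper called above is not shown on this page. As a rough orientation only, a minimal sketch consistent with the keyword arguments used in these examples (input_dim, hidden_dims, use_batchnorm, dropout, fc_bias, relu) could look like the following; the exact layer ordering in the original helper is an assumption.

import torch.nn as nn

def build_mlp(input_dim, hidden_dims, use_batchnorm=False, dropout=0.0,
              fc_bias=True, relu=True):
    # Stack one Linear (+ optional BatchNorm1d, ReLU, Dropout) block per entry
    # in hidden_dims. The ordering of the optional layers is assumed here.
    layers = []
    dim = input_dim
    for hidden_dim in hidden_dims:
        layers.append(nn.Linear(dim, hidden_dim, bias=fc_bias))
        if use_batchnorm:
            layers.append(nn.BatchNorm1d(hidden_dim))
        if relu:
            layers.append(nn.ReLU(inplace=True))
        if dropout > 0:
            layers.append(nn.Dropout(p=dropout))
        dim = hidden_dim
    return nn.Sequential(*layers)
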
Example #2
    def __init__(self,
                 opts,
                 img_fc_dim,
                 img_fc_use_batchnorm,
                 img_dropout,
                 img_feat_input_dim,
                 rnn_hidden_size,
                 rnn_dropout,
                 max_len,
                 fc_bias=True,
                 max_navigable=16):
        super(Regretful, self).__init__()

        self.opts = opts
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable

        proj_navigable_kwargs = {
            'input_dim': img_feat_input_dim,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1])

        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)
        self.positional_encoding = PositionalEncoding(rnn_hidden_size,
                                                      dropout=0.1,
                                                      max_len=max_len)
        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)
        self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size,
                                rnn_hidden_size)

        self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])

        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1],
                                    rnn_hidden_size,
                                    bias=fc_bias)

        self.critic_fc = nn.Linear(max_len + rnn_hidden_size, 1)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

        self.critic_valueDiff_fc = nn.Linear(1, 2)
        self.relu = nn.ReLU(inplace=True)
        self.softmax = nn.Softmax(dim=1)

        self.move_fc = nn.Linear(img_fc_dim[-1],
                                 img_fc_dim[-1] + opts.tiled_len)

        self.num_predefined_action = 1
    def __init__(self,
                 opts,
                 img_fc_dim,
                 img_fc_use_batchnorm,
                 img_dropout,
                 img_feat_input_dim,
                 rnn_hidden_size,
                 rnn_dropout,
                 max_len,
                 fc_bias=True,
                 max_navigable=16):
        super(SpeakerFollowerBaseline, self).__init__()

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size

        self.proj_img_mlp = nn.Linear(img_feat_input_dim,
                                      img_fc_dim[-1],
                                      bias=fc_bias)

        self.proj_navigable_mlp = nn.Linear(img_feat_input_dim,
                                            img_fc_dim[-1],
                                            bias=fc_bias)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=False)

        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)

        self.lstm = nn.LSTMCell(img_feat_input_dim * 2, rnn_hidden_size)

        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=False)

        self.proj_out = nn.Linear(rnn_hidden_size,
                                  img_fc_dim[-1],
                                  bias=fc_bias)
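SoftAttention (like StateAttention and PositionalEncoding) is imported from elsewhere in these projects. For orientation, a minimal dot-product soft attention over the projected navigable features could be sketched as below; this is a hypothetical stand-in, not necessarily the projects' implementation, and the forward signature is assumed.

import torch
import torch.nn as nn

class SoftAttentionSketch(nn.Module):
    # Sketch: score a hidden state against a set of candidate features and
    # return the attention-weighted context plus the weights themselves.
    def __init__(self):
        super(SoftAttentionSketch, self).__init__()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, h, proj_context, context=None, mask=None):
        # h: (batch, dim); proj_context: (batch, num_candidates, dim)
        attn = torch.bmm(proj_context, h.unsqueeze(2)).squeeze(2)
        if mask is not None:
            attn = attn.masked_fill(mask, -float('inf'))
        attn = self.softmax(attn)
        if context is None:
            context = proj_context
        weighted_context = torch.bmm(attn.unsqueeze(1), context).squeeze(1)
        return weighted_context, attn
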
Example #4
    def __init__(self, opts, img_fc_dim, img_fc_use_batchnorm, img_dropout, img_feat_input_dim,
                 rnn_hidden_size, rnn_dropout, max_len, fc_bias=True, max_navigable=16):
        super(ConfiguringObject, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size

        proj_navigable_obj_kwargs = {
            'input_dim': 152,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias,
            'relu': opts.mlp_relu
        }
        self.proj_navigable_obj_mlp = build_mlp(**proj_navigable_obj_kwargs)

        proj_navigable_img_kwargs = {
            'input_dim': img_feat_input_dim,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias,
            'relu': opts.mlp_relu
        }
        self.proj_navigable_img_mlp = build_mlp(**proj_navigable_img_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=False)

        self.soft_attn = SoftAttention()
        
        self.state_attention = StateAttention()

        self.config_obj_attention = ConfigObjAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)
        
        #self.lstm = nn.LSTMCell(img_fc_dim[-1] + 768, rnn_hidden_size)
        self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size, rnn_hidden_size)


        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=False)

        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1], rnn_hidden_size, bias=fc_bias)

        self.proj_out = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)

       # self.logit_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1])
        self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])

        self.r_linear = nn.Linear(rnn_hidden_size + 128, 4)

        self.image_linear = nn.Linear(img_feat_input_dim, img_fc_dim[-1])

        self.config_fc = nn.Linear(768, 512, bias=False)

        self.config_atten_linear = nn.Linear(512, 128)
        #self.config_atten_linear = nn.Linear(768, 128)

        self.sm = nn.Softmax(dim=1)

        if opts.monitor_sigmoid:
            self.critic = nn.Sequential(
                #nn.Linear(max_len + rnn_hidden_size, 1),
                nn.Linear(10 + rnn_hidden_size, 1),
                nn.Sigmoid()
            )
        else:
            self.critic = nn.Sequential(
               # nn.Linear(max_len + rnn_hidden_size, 1),
                nn.Linear(10 + rnn_hidden_size, 1),
                nn.Tanh()
            )

        self.r_transform = Variable(
            torch.tensor([[1, 0, 0.75, 0.5],
                          [0, 1, 0.25, 0.5]]).transpose(0, 1),
            requires_grad=False)

    def __init__(
        self,
        opts,
        img_fc_dim,
        img_fc_use_batchnorm,
        img_dropout,
        img_feat_input_dim,
        rnn_hidden_size,
        rnn_dropout,
        max_len,
        film_size=2048,
        fc_bias=True,
        max_navigable=16,
        conv_hidden=2048,
        num_resblocks=8,
    ):
        super(SelfMonitoring, self).__init__()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size
        self.max_len = max_len

        proj_navigable_kwargs = {
            "input_dim": img_feat_input_dim,
            "hidden_dims": img_fc_dim,
            "use_batchnorm": img_fc_use_batchnorm,
            "dropout": img_dropout,
            "fc_bias": fc_bias,
            "relu": opts.mlp_relu,
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)

        self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size,
                                rnn_hidden_size)

        self.lang_position = PositionalEncoding(rnn_hidden_size,
                                                dropout=0.1,
                                                max_len=max_len)

        self.logit_fc = nn.Linear(film_size, img_fc_dim[-1])

        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1],
                                    rnn_hidden_size,
                                    bias=fc_bias)

        if opts.monitor_sigmoid:
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Sigmoid())
        else:
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Tanh())

        self.num_predefined_action = 1

        # EDIT: add FiLM
        self.resnet = torch.hub.load("pytorch/vision:v0.5.0",
                                     "resnet152",
                                     pretrained=True)
        self.resnet = nn.Sequential(*list(self.resnet.children())[:-2])

        self.film_gen = FiLMGenerator(
            context_size=rnn_hidden_size,
            num_resblocks=num_resblocks,
            conv_hidden=conv_hidden,
        )

        self.film = FiLMedResBlocks(
            num_blocks=num_resblocks,
            conv_hidden=conv_hidden,
            with_batch_norm=True,
        )

        self.film_tail = nn.AdaptiveAvgPool2d(1)
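The FiLMGenerator and FiLMedResBlocks modules referenced above are defined elsewhere in that project. For orientation only, a self-contained, hypothetical FiLM block (not the project's implementation) shows the underlying mechanism: a conditioning vector predicts per-channel scale and shift that modulate a convolutional feature map.

import torch
import torch.nn as nn

class FiLMBlockSketch(nn.Module):
    # Hypothetical FiLM residual block: context -> (gamma, beta) -> channel-wise
    # affine modulation of a conv feature map, with a residual connection.
    def __init__(self, context_size, num_channels):
        super(FiLMBlockSketch, self).__init__()
        self.film_params = nn.Linear(context_size, 2 * num_channels)
        self.conv = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(num_channels)

    def forward(self, feat_map, context):
        # feat_map: (batch, C, H, W); context: (batch, context_size)
        gamma, beta = self.film_params(context).chunk(2, dim=1)
        gamma = gamma.unsqueeze(-1).unsqueeze(-1)
        beta = beta.unsqueeze(-1).unsqueeze(-1)
        out = self.bn(self.conv(feat_map))
        out = torch.relu(gamma * out + beta)
        return out + feat_map
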
    def __init__(self,
                 opts,
                 img_fc_dim,
                 img_fc_use_batchnorm,
                 img_dropout,
                 img_feat_input_dim,
                 rnn_hidden_size,
                 rnn_dropout,
                 max_len,
                 fc_bias=True,
                 max_navigable=16):
        super(SelfMonitoring, self).__init__()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.max_navigable = max_navigable
        self.feature_size = img_feat_input_dim
        self.hidden_size = rnn_hidden_size
        self.max_len = max_len

        proj_navigable_kwargs = {
            'input_dim': img_feat_input_dim,
            'hidden_dims': img_fc_dim,
            'use_batchnorm': img_fc_use_batchnorm,
            'dropout': img_dropout,
            'fc_bias': fc_bias,
            'relu': opts.mlp_relu
        }
        self.proj_navigable_mlp = build_mlp(**proj_navigable_kwargs)

        self.h0_fc = nn.Linear(rnn_hidden_size, img_fc_dim[-1], bias=fc_bias)
        self.h1_fc = nn.Linear(rnn_hidden_size, rnn_hidden_size, bias=fc_bias)

        self.soft_attn = SoftAttention()

        self.dropout = nn.Dropout(p=rnn_dropout)

        # The usual (img_fc_dim[-1] * 2 + rnn_hidden_size)-dim LSTM input is
        # replaced here by a hard-coded 2587-dim input.
        # self.lstm = nn.LSTMCell(img_fc_dim[-1] * 2 + rnn_hidden_size,
        #                         rnn_hidden_size)
        self.lstm = nn.LSTMCell(2587, rnn_hidden_size)

        self.lang_position = PositionalEncoding(rnn_hidden_size,
                                                dropout=0.1,
                                                max_len=max_len)

        self.logit_fc = nn.Linear(rnn_hidden_size * 2, img_fc_dim[-1])
        self.h2_fc_lstm = nn.Linear(rnn_hidden_size + img_fc_dim[-1],
                                    rnn_hidden_size,
                                    bias=fc_bias)

        if opts.monitor_sigmoid:  # not the branch used in this setup
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Sigmoid())
        else:  # the branch used in this setup
            self.critic = nn.Sequential(
                nn.Linear(max_len + rnn_hidden_size, 1), nn.Tanh())

        self.num_predefined_action = 1
        self.object_t_size = 17  # object size
        self.place_t_size = 10  # place size

        self.object_attention_layer = ObjectDotAttention(
            rnn_hidden_size, self.object_t_size)
        self.place_attention_layer = VisualSoftDotAttention(
            rnn_hidden_size, self.place_t_size)
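
Across these examples the constructors read opts.mlp_relu, opts.monitor_sigmoid and, in the Regretful variant, opts.tiled_len. A hypothetical instantiation of the last class above, assuming the supporting modules (build_mlp, SoftAttention, PositionalEncoding, ObjectDotAttention, VisualSoftDotAttention) are importable and using made-up hyperparameter values chosen only to be shape-consistent, could look like this:

from argparse import Namespace

# Hypothetical option values; the real training scripts define these flags.
opts = Namespace(mlp_relu=True, monitor_sigmoid=False, tiled_len=10)

model = SelfMonitoring(
    opts,
    img_fc_dim=[128],           # hidden_dims passed to build_mlp
    img_fc_use_batchnorm=False,
    img_dropout=0.5,
    img_feat_input_dim=2176,    # e.g. 2048-dim image feature + 128-dim orientation encoding
    rnn_hidden_size=512,
    rnn_dropout=0.5,
    max_len=80,
)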