def forward(self, iter_num, batch_sentence, batch_verb, vocab):
     sen_embed = self.sen_embed(batch_sentence)
     verb_embed = self.verb_embed(batch_verb)
     sen_embed = self.sen_embed_linear(sen_embed)
     # sen_embed = self.sen_embed_dropout(sen_embed)
     verb_embed = self.verb_embed_linear(verb_embed)
     # verb_embed = self.verb_embed_dropout(verb_embed)
     embed = torch.cat((sen_embed, verb_embed), dim=1)
     embed = self.cat_embed_linear(embed)
     embed = self.cat_embed_dropout(embed)
     ### embed.size() -> [batch_size, max_seq_len, common_size] ###
     src = embed.permute(1, 0, 2)
     ### src.size() -> [max_seq_len, batch_size, common_size] ###
     tgt = embed.permute(1, 0, 2)
     ### tgt.size() -> [max_seq_len, batch_size, common_size] ###
     output = self.transformer(src, tgt)
     ### output.size() -> [max_seq_len, batch_size, common_size] ###
     sen2vec = self.output_linear(output.permute(1, 2, 0)).permute(2, 0, 1)
     sen2vec = l2norm(sen2vec)
     ### sen2vec.size() -> [1, batch_size, common_size] ###
     sen2vec = self.sen2vec_linear(sen2vec)
     sen2vec = self.sen2vec_dropout(sen2vec)
     sen2vec = l2norm(sen2vec)
     sen2vec = sen2vec.squeeze(dim=0)
     # print('sen2vec:', sen2vec.size(), src.size(), tgt.size(), output.size(), output.permute(1, 0, 2).size())
     # if iter_num % 100 == 0:
     #     print('sen2vec:', sen2vec)
     return sen2vec, output.permute(1, 0, 2)
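All of the snippets on this page call an l2norm helper that is not shown. For the PyTorch examples it presumably rescales each embedding vector to unit L2 norm; a minimal sketch under that assumption (not the original implementation):

import torch

def l2norm(x, dim=-1, eps=1e-8):
    # assumed helper: normalize each vector along `dim` to unit L2 length
    return x / (x.norm(p=2, dim=dim, keepdim=True) + eps)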
Example #2
    def forward(self, x_id, im, x):
        x_id_emb = self.embedding(x_id)
        im = self.linear(im)

        x_w2v = torch.zeros(*x_id_emb.size())
        x_cat = None
        if self.model_options['concat']:
            for i, text in enumerate(x):
                for j, word in enumerate(text.split()):
                    try:
                        x_w2v[j, i] = torch.from_numpy(
                            wvModel[word.decode('utf8')])
                    except KeyError:
                        pass
            x_w2v = Variable(x_w2v.cuda())
            x_cat = torch.cat([x_id_emb, x_w2v])
        else:
            x_cat = x_id_emb

        if self.model_options['encoder'] == 'bow':
            x_cat = x_cat.sum(0).squeeze(0)
        else:
            _, (x_cat, _) = self.lstm(x_cat)
            x_cat = x_cat.squeeze(0)

        return l2norm(x_cat), l2norm(im)
    def forward(self, iter_num, batch_sentence, batch_verb, vocab):
        words_num = batch_sentence.ne(vocab.padidx).sum(dim=1).to(device)
        verbs_num = batch_verb.ne(vocab.padidx).sum(dim=1).to(device)
        sen_embed = self.sen_embed(batch_sentence)
        verb_embed = self.verb_embed(batch_verb)

        # sen_embed, sen_embed_attn = self.sen_embed_attention(
        #     sen_embed, sen_embed)  # embed_attention #
        # verb_embed, verb_embed_attn = self.verb_embed_attention(
        #     verb_embed, sen_embed)  # embed_attention #

        sen_output, (sen_h_n, sen_c_n) = self.sen_lstm(
            sen_embed,
            self.sen_h0.permute(1, 0, 2).contiguous(),
            self.sen_c0.permute(1, 0, 2).contiguous())
        verb_output, (verb_h_n, verb_c_n) = self.verb_lstm(
            verb_embed,
            self.verb_h0.permute(1, 0, 2).contiguous(),
            self.verb_c0.permute(1, 0, 2).contiguous())

        ### sen_output.size() -> [batch_size, max_seq_len, hidden_size*2] ###

        ### sen_output_cat.size() -> [batch_size, max_seq_len+max_verb_len, hidden_size*2] ###
        # output_cat = torch.cat((sen_output, verb_output), dim=1)

        # self_attention = True
        self_attention = False

        if self_attention:
            output, output_attn = self.lstm_attention(
                sen_output, verb_output)  # lstm_attention #
            sen2vec = output.permute(0, 2, 1)
            sen2vec = self.len_linear(sen2vec).squeeze(dim=2)
        else:
            sen2vec = torch.stack(
                [sen_output[i, j - 1, :] for i, j in enumerate(words_num)],
                dim=0)
            verb2vec = torch.stack(
                [verb_output[i, j - 1, :] for i, j in enumerate(verbs_num)],
                dim=0)
            sen2vec = torch.cat((sen2vec, verb2vec), dim=1)
            sen2vec = self.cat_linear(sen2vec)

        sen2vec = self.sen2vec_linear(sen2vec)
        # sen2vec, sen2vec_attn = self.sen2vec_attention(sen2vec, sen2vec)
        sen2vec = self.sen2vec_dropout(sen2vec)
        sen2vec = l2norm(sen2vec)
        sen_out = torch.cat((sen_output, verb_output), dim=1)
        sen_out = self.sen_out_linear(sen_out)
        sen_out = self.sen_out_dropout(sen_out)
        sen_out = l2norm(sen_out)
        sen_h = torch.cat((sen_h_n, verb_h_n), dim=2)
        sen_h = self.sen_h_linear(sen_h)
        sen_h = self.sen_h_dropout(sen_h)
        sen_h = l2norm(sen_h)
        # print('sen2vec:', sen2vec.size(), sen_out.size(), sen_h.size())
        # if iter_num % 100 == 0:
        #     print('sen2vec:', sen2vec)
        return sen2vec, sen_out, sen_h
Example #4
    def forward(self, x, im):
        x_emb = self.embedding(x)
        im = self.linear(im)

        _, (x_emb, _) = self.lstm(x_emb)
        x_emb = x_emb.squeeze(0)

        return l2norm(x_emb), l2norm(im)
Example #5
    def forward(self, en, cn):
        en_embed = self.embedding(en)
        cn = self.linear(cn)

        _, (en_embed, _) = self.lstm(en_embed)
        en_embed = en_embed.squeeze(0)

        return l2norm(en_embed), l2norm(cn)
Example #6
    def forward(self, x, im):
        x = self.embedding(x)
        im = self.linear(im)

        if self.model_options['encoder'] == 'bow':
            x = x.sum(0).squeeze(0)
        else:
            _, (x, _) = self.lstm(x)
            x = x.squeeze(0)

        return l2norm(x), l2norm(im)
Example #7
def evaluate_i2t(image_mha, image_encoder, bert_model, text_encoder,
                 image_dataloader, text_dataloader, ks):
    with torch.no_grad():
        all_text_features = []
        text_index = 0
        res_dict = dict()
        for filenames, input_ids, attention_masks in text_dataloader:
            for filename in filenames:
                image_id = int(re.findall(r'\d{12}', filename)[0])
                if image_id not in res_dict:
                    res_dict[image_id] = []
                res_dict[image_id].append(text_index)
                text_index += 1
            # Get text features
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            text_features = bert_model(input_ids,
                                       attention_mask=attention_masks)
            text_features = l2norm(text_features)
            text_features = text_encoder(text_features)
            all_text_features.append(text_features)
        all_text_features = torch.cat(all_text_features, dim=0)
        recall = np.zeros(len(ks))
        max_k = max(ks)
        total_query = 0
        pbar = tqdm(enumerate(image_dataloader),
                    total=len(image_dataloader),
                    leave=False,
                    position=0,
                    file=sys.stdout)
        for i, (image_ids, features) in pbar:
            mha_features = []
            for feature in features:
                feature = l2norm(feature.to(device))
                feature = l2norm(image_mha(feature))
                feature = torch.mean(feature, dim=0, keepdim=True)
                mha_features.append(feature)
            mha_features = torch.cat(mha_features, dim=0)
            image_features = image_encoder(mha_features)
            all_indices = get_top_k_eval(image_features, all_text_features,
                                         max_k)
            for idx, indices in enumerate(all_indices):
                total_query += 1
                image_id = image_ids[idx].item()
                true_text_indices = torch.tensor(res_dict[image_id])

                for i, k in enumerate(ks):
                    top_k_text = indices[:k].to('cpu')
                    relevant_text = np.intersect1d(top_k_text,
                                                   true_text_indices)
                    if relevant_text.shape[0] > 0:
                        recall[i] += 1
        recall = recall / total_query
    return recall
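get_top_k_eval is not defined in any of these excerpts. A plausible, purely hypothetical implementation, assuming dot-product similarity between the already-normalized query and gallery embeddings:

import torch

def get_top_k_eval(query_features, gallery_features, k):
    # score every query against every gallery item and return the indices
    # of the k highest-scoring gallery items per query
    scores = torch.matmul(query_features, gallery_features.t())
    _, indices = scores.topk(k, dim=1)
    return indices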
Example #8
    def forward(self, image, verb_id):
        '''print('testing 123')
        x = torch.tensor([[1, 2, 3],[4,5,6]])
        print('original', x.size())
        x = x.repeat(1,2)
        print('xxxxxx', x, x.view(-1,3), x.size())'''

        conv = self.conv(image)

        #verb pred
        verb_rep = self.verb(conv)
        verb_embedding = self.verb_transform(self.verb_lookup(verb_id))

        return utils.l2norm(verb_rep), utils.l2norm(verb_embedding)
Example #9
    def forward(self, en, en_lengths, en_index, cn, cn_lengths, cn_index):
        """
        Input Variable:
            input_var: A variables whose size is (B,W), B is the batch size and W is the longest sequence length in the batch
            input_lengths: The lengths of each element in the batch.
            hidden: The hidden state variable whose size is (num_layer*num_directions,batch_size,hidden_size)
        Output:
            output: A variable with tensor size W*B*N, W is the maximum length of the batch, B is the batch size, and N is the hidden size
            hidden: The hidden state variable with tensor size (num_layer*num_direction,B,N)
        """
        en = self.sorted_forward(en, en_lengths, en_index)
        cn = self.sorted_forward(cn, cn_lengths, cn_index)

        return l2norm(en), l2norm(cn)
Example #10
    def forward_loss(self, img_span_features, cap_span_features, img_lengths,
                     txt_lengths, img_span_bounds, txt_span_bounds,
                     img_span_margs, txt_span_margs):
        b = img_span_features.size(0)
        N_txt = txt_lengths.max(0)[0]
        mstep_txt = (txt_lengths * 2).int()
        # focus on only short spans
        nstep_txt = int(mstep_txt.float().mean().item())

        N_img = img_lengths.max(0)[0]
        mstep_img = (img_lengths * 2).int()
        # focus on only short spans
        nstep_img = int(mstep_img.float().mean().item())

        matching_loss_matrix = torch.zeros(b,
                                           nstep_img,
                                           nstep_txt,
                                           device=img_span_features.device)
        similarity_matrix = torch.zeros(b,
                                        b,
                                        nstep_img,
                                        nstep_txt,
                                        device=img_span_features.device)

        for j in range(nstep_img):
            for k in range(nstep_txt):
                cap_emb = cap_span_features[:, k]
                img_emb = img_span_features[:, j]
                cap_marg = txt_span_margs[:, k].softmax(-1).unsqueeze(-2)
                cap_emb = torch.matmul(cap_marg, cap_emb).squeeze(-2)

                img_marg = img_span_margs[:, j].softmax(-1).unsqueeze(-2)
                img_emb = torch.matmul(img_marg, img_emb).squeeze(-2)

                cap_emb = utils.l2norm(cap_emb)
                img_emb = utils.l2norm(img_emb)
                similarity_matrix[:, :, j, k] = self.similarity(img_emb, cap_emb)

        img_span_margs = img_span_margs.sum(-1).unsqueeze(2).unsqueeze(1)
        txt_span_margs = txt_span_margs.sum(-1).unsqueeze(1).unsqueeze(0)

        expected_similarity = (img_span_margs[:, :, :nstep_img, :] *
                               txt_span_margs[:, :, :, :nstep_txt] *
                               similarity_matrix)
        expected_similarity = expected_similarity.sum([-2, -1])

        expected_loss = self.contrastive(expected_similarity)
        return expected_loss
Example #11
def evaluate_t2i(image_mha, image_encoder, bert_model, text_encoder,
                 image_dataloader, text_dataloader, ks):
    # Load image features
    with torch.no_grad():
        image_features = []
        image_ids = []
        for ids, features in image_dataloader:
            image_ids.append(torch.stack(ids))
            mha_features = []
            for feature in features:
                feature = l2norm(feature.to(device))
                feature = l2norm(image_mha(feature))
                feature = torch.mean(feature, dim=0, keepdim=True)
                mha_features.append(feature)
            mha_features = torch.cat(mha_features, dim=0)
            image_features.append(image_encoder(mha_features))
        image_features = torch.cat(image_features, dim=0)
        image_ids = torch.cat(image_ids, dim=0).to(device)
        # Evaluate
        max_k = max(ks)
        recall = np.zeros(len(ks))
        total_query = 0
        pbar = tqdm(enumerate(text_dataloader),
                    total=len(text_dataloader),
                    leave=False,
                    position=0,
                    file=sys.stdout)
        for i, (image_files, input_ids, attention_mask) in pbar:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            text_features = bert_model(input_ids,
                                       attention_mask=attention_mask)
            text_features = l2norm(text_features)
            text_features = text_encoder(text_features)
            image_files = torch.tensor(
                list(
                    map(lambda x: int(re.findall(r'\d{12}', x)[0]),
                        image_files))).to(device)
            top_k = get_top_k_eval(text_features, image_features, max_k)
            for idx, indices in enumerate(top_k):
                total_query += 1
                true_image_id = image_files[idx]
                sorted_image_ids = torch.gather(image_ids, 0, indices)
                for i, k in enumerate(ks):
                    top_k_images = sorted_image_ids[:k]
                    if (top_k_images == true_image_id).nonzero().numel() > 0:
                        recall[i] += 1
        recall = recall / total_query
        return recall
Example #12
def build_model(tparams, options):                                                                                           
    """
    Computation graph for the model
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    im = tensor.matrix('im', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding (source)
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])

    # Encode sentences (source)
    if options['encoder'] == 'bow':
        sents = (emb * mask[:,:,None]).sum(0)
    else:
        proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                                prefix='encoder',
                                                mask=mask)
        sents = proj[0][-1]
    sents = l2norm(sents)

    # Encode images (source)
    images = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')

    # Compute loss
    cost = contrastive_loss(options['margin'], images, sents)

    return trng, [x, mask, im], cost
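contrastive_loss is assumed from the surrounding module. As a guess, it is the usual margin-based ranking loss used by image-sentence embedding models; a sketch in the same Theano tensor notation as the snippet above (an assumption, not the original code):

def contrastive_loss(margin, im, s):
    # pairwise image-sentence score matrix; the diagonal holds matching pairs
    scores = tensor.dot(im, s.T)
    diagonal = tensor.diagonal(scores)
    # hinge cost for ranking captions against each image, and images against each caption
    cost_s = tensor.maximum(0., margin - diagonal + scores)
    cost_im = tensor.maximum(0., margin - diagonal.reshape((-1, 1)) + scores)
    # zero the diagonal so matching pairs contribute no cost
    mask = tensor.eye(scores.shape[0])
    return ((cost_s + cost_im) * (1. - mask)).sum()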
Example #13
def build_sentence_encoder(tparams, options):
    """
    Encoder only, for sentences
    """
    opt_ret = dict()

    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('x_mask', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])

    # Encode sentences
    if options['encoder'] == 'bow':
        sents = (emb * mask[:, :, None]).sum(0)
    else:
        proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                                prefix='encoder',
                                                mask=mask)
        sents = proj[0][-1]
    sents = l2norm(sents)

    return trng, [x, mask], sents
Example #14
    def forward(self, features, img_lengths):
        b, N, _ = features.size()
        # b, N, _, _, _ = input.size()
        # input = input.reshape(-1, input.size(-3), input.size(-2), input.size(-1))
        # features = self.conv(input)
        # features = F.relu(features)
        # features = features.reshape(features.size(0), -1)
        features = self.fc1(features)
        dim = features.size(-1)
        assert N == img_lengths.max()
        feats = torch.zeros(b,
                            int(N * (N - 1) / 2),
                            self.NT,
                            self.sem_dim,
                            device=features.device)
        beg_idx = 0
        for k in range(1, N):
            inc = torch.arange(N - k, device=features.device).view(
                N - k, 1)  #.expand(N - k, k + 1)
            idx = torch.arange(k + 1,
                               device=features.device).view(1, k + 1).repeat(
                                   N - k, 1)
            idx = (idx + inc).view(-1)
            idx = idx.unsqueeze(0).unsqueeze(-1).expand(b, -1, dim)

            feat = torch.gather(features, 1, idx)
            feat = feat.view(b, N - k, k + 1, dim)
            feat = feat.unsqueeze(3).expand(b, N - k, k + 1, self.NT,
                                            self.sem_dim)
            feat = feat.view(b, N - k, k + 1, self.NT, self.sem_dim)
            feat = l2norm(feat.sum(2))
            end_idx = beg_idx + N - k
            feats[:, beg_idx:end_idx] = feat
            beg_idx = end_idx
        return feats
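For intuition, the index construction above enumerates every contiguous span of length k + 1. A tiny standalone check of just that arithmetic (nothing here beyond the loop's own expressions):

import torch

N, k = 4, 1
inc = torch.arange(N - k).view(N - k, 1)
idx = torch.arange(k + 1).view(1, k + 1).repeat(N - k, 1)
print(idx + inc)  # tensor([[0, 1], [1, 2], [2, 3]]) -> the length-2 spans (0,1), (1,2), (2,3)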
Example #15
def encode_images(tparams, options, im):
    im_emb = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')
    im_emb = l2norm(im_emb)
    if options['abs']:
        im_emb = abs(im_emb)

    return im_emb
Example #16
    def _read(self, feat_path, meta_path, proposal_folders):
        fn_node_pattern = '*_node.npz'
        fn_edge_pattern = '*_edge.npz'

        with Timer('read meta and feature'):
            self.lb2idxs, self.idx2lb = read_meta(meta_path)
            inst_num = len(self.idx2lb)
            if not self.featureless:
                features = read_probs(feat_path, inst_num, self.feature_dim)
                self.features = l2norm(features)
            else:
                self.feature_dim = 1
                self.features = np.ones(inst_num).reshape(-1, 1)

        with Timer('read proposal list'):
            self.lst = []
            for proposal_folder in proposal_folders:
                print('read proposals from folder: ', proposal_folder)
                fn_nodes = sorted(
                    glob.glob(os.path.join(proposal_folder, fn_node_pattern)))
                fn_edges = sorted(
                    glob.glob(os.path.join(proposal_folder, fn_edge_pattern)))
                assert len(fn_nodes) == len(
                    fn_edges), "node files({}) vs edge files({})".format(
                        len(fn_nodes), len(fn_edges))
                assert len(fn_nodes) > 0, 'files under {} is 0'.format(
                    proposal_folder)
                for fn_node, fn_edge in zip(fn_nodes, fn_edges):
                    assert fn_node[:fn_node.rfind('_')] == fn_edge[:fn_edge.rfind('_')], \
                        "{} vs {}".format(fn_node, fn_edge)
                    self.lst.append([fn_node, fn_edge])
            self.size = len(self.lst)
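In the clustering _read examples, l2norm is applied to a NumPy feature matrix loaded by read_probs rather than to a torch tensor. A minimal NumPy sketch under the same row-normalization assumption (again not the original helper):

import numpy as np

def l2norm(x, eps=1e-8):
    # assumed NumPy counterpart: normalize every row to unit L2 norm
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / (norms + eps)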
Example #17
    def forward(self, input):
        output = self.linear(input)

        if self.l2_norm:
            output = l2norm(output)

        return output
Example #18
def encode_images(tparams, options, im):
    im_emb = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')
    im_emb = l2norm(im_emb)
    if options['abs']:
        im_emb = abs(im_emb)

    return im_emb
Example #19
def image_template_feature(img_feats, template, media, choose_templates,
                           choose_ids):
    template = np.array(template, dtype=int)
    media = np.array(media, dtype=int)
    unique_templates, indices = np.unique(choose_templates, return_index=True)
    unique_subjectids = choose_ids[indices]
    template_feats = []
    for uqt in tqdm(unique_templates):
        ind_t = np.where(template == uqt)
        face_norm_feats = img_feats[ind_t]
        face_medias = media[ind_t]
        unique_medias, unique_media_counts = np.unique(face_medias,
                                                       return_counts=True)
        media_norm_feats = []
        for u, ct in zip(unique_medias, unique_media_counts):
            ind_m = np.where(face_medias == u)
            if ct == 1:
                media_norm_feats.append(face_norm_feats[ind_m])
            else:
                media_norm_feats.append(
                    np.mean(face_norm_feats[ind_m], axis=0, keepdims=True))
        media_norm_feats = np.array(media_norm_feats)
        template_feats.append(np.sum(media_norm_feats, axis=0))
    template_feats = np.concatenate(template_feats, axis=0)
    template_norm_feats = l2norm(template_feats)

    return template_norm_feats, unique_templates, unique_subjectids
def build_model(tparams, options):                                                                                           
    """
    Computation graph for the model
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    im = tensor.matrix('im', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding (source)
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])

    # Encode sentences (source)
    if options['encoder'] == 'bow':
        sents = (emb * mask[:,:,None]).sum(0)
    else:
        proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                                prefix='encoder',
                                                mask=mask)
        sents = proj[0][-1]
    sents = l2norm(sents)

    # Encode images (source)
    images = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')

    # Compute loss
    cost = contrastive_loss(options['margin'], images, sents)

    return trng, [x, mask, im], cost
Example #21
def build_image_encoder(tparams, options):
    """
    Encoder only, for images
    """
    opt_ret = dict()

    trng = RandomStreams(1234)

    # image features
    im = tensor.matrix('im', dtype='float32')

    # Encode images
    images_mm = get_layer('ff')[1](tparams,
                                   im,
                                   options,
                                   prefix='ff_image_mm',
                                   activ='linear')
    if 'attention_type' not in options or options['attention_type'] == 'dot':
        images_mm = l2norm(images_mm)

    if options['use_dropout']:
        images_mm *= shared_dropout_layer(
            (n_samples, options['dim_multimodal']), use_noise, trng,
            retain_probability_hidden)

    return trng, [im], images_mm
def build_sentence_encoder(tparams, options):
    """
    Encoder only, for sentences
    """
    opt_ret = dict()

    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('x_mask', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])

    # Encode sentences
    if options['encoder'] == 'bow':
        sents = (emb * mask[:,:,None]).sum(0)
    else:
        proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                                prefix='encoder',
                                                mask=mask)
        sents = proj[0][-1]
    sents = l2norm(sents)

    return trng, [x, mask], sents
Example #23
    def get_approx_min_longest_edge(simplex, L):
        '''n->1 lower-bound-minimum approximation: max(f(A) - L*|AB|, f(B) - L*|AB|)'''
        A = simplex[0]
        B = simplex[1]

        AB_dist = l2norm(A[:-1], B[:-1])
        return max([A[-1]['obj'] - L*AB_dist,  B[-1]['obj'] - L*AB_dist])
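Note that the simplex/Lipschitz-bound examples call l2norm with two arguments, i.e. as a Euclidean distance between two points rather than as the normalization used by the embedding models above. A minimal sketch of that variant (an assumption inferred from the call sites, not the original code):

import numpy as np

def l2norm(a, b):
    # Euclidean distance between two points given as coordinate sequences
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return float(np.sqrt(np.sum((a - b) ** 2)))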
    def _read(self, feat_path, label_path, proposal_folders):
        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.labels = intdict2ndarray(self.idx2lb)
                self.inst_num = len(self.idx2lb)
                self.ignore_label = False
            else:
                self.lb2idxs, self.idx2lb = None, None
                self.labels = None
                self.inst_num = -1
                self.ignore_label = True
            if not self.featureless:
                features = read_probs(feat_path, self.inst_num,
                                      self.feature_dim)
                self.features = l2norm(features)
                if self.inst_num == -1:
                    self.inst_num = features.shape[0]
            else:
                assert self.inst_num > 0
                self.feature_dim = 1
                self.features = np.ones(self.inst_num).reshape(-1, 1)

        with Timer('read proposal list'):
            self.lst = []
            self.tot_lst = []
            if callable(proposal_folders):
                proposal_folders = proposal_folders()
            for proposal_folder in proposal_folders:
                print('read proposals from folder: ', proposal_folder)
                fn_nodes = sorted(
                    glob.glob(osp.join(proposal_folder, self.fn_node_pattern)))
                fn_edges = sorted(
                    glob.glob(osp.join(proposal_folder, self.fn_edge_pattern)))
                assert len(fn_nodes) == len(
                    fn_edges), "node files({}) vs edge files({})".format(
                        len(fn_nodes), len(fn_edges))
                assert len(fn_nodes) > 0, 'files under {} is 0'.format(
                    proposal_folder)
                for fn_node, fn_edge in zip(fn_nodes, fn_edges):
                    # sanity check
                    assert fn_node[:fn_node.rfind('_')] == fn_edge[:fn_edge.rfind('_')], \
                        "{} vs {}".format(fn_node, fn_edge)
                    if self._check_iop(fn_node):
                        self.lst.append([fn_node, fn_edge])
                    self.tot_lst.append([fn_node, fn_edge])

            self.size = len(self.lst)
            self.tot_size = len(self.tot_lst)
            assert self.size <= self.tot_size

            if self.size < self.tot_size:
                print('select {} / {} = {:.2f} proposals '
                      'with iop between ({:.2f}, {:.2f})'.format(
                          self.size, self.tot_size,
                          1. * self.size / self.tot_size, self.th_iop_min,
                          self.th_iop_max))
Example #25
 def forward_sens(self, x):
     x = self.embedding(x)
     if self.model_options['encoder'] == 'bow':
         x = x.sum(0).squeeze(0)
     else:
         _, (x, _) = self.lstm(x)
         x = x.squeeze(0)
     return l2norm(x)
 def evaluate(self, val_image_dataloader, val_text_dataloader, k):
     self.switch_to_eval()
     # Load image features
     with torch.no_grad():
         image_features = []
         image_ids = []
         for ids, features, image_attention_mask in val_image_dataloader:
             image_ids.append(torch.stack(ids))
             features = torch.stack(features).to(self.device)
             image_attention_mask = torch.stack(image_attention_mask).to(
                 self.device)
             features = l2norm(features).detach()
             mha_features = l2norm(
                 self.image_mha(features, image_attention_mask))
             image_features.append(self.image_encoder(mha_features))
             # image_features.append(mha_features)
         image_features = torch.cat(image_features, dim=0)
         image_ids = torch.cat(image_ids, dim=0).to(self.device)
         # Evaluate
         recall = 0
         total_query = 0
         pbar = tqdm(enumerate(val_text_dataloader),
                     total=len(val_text_dataloader),
                     leave=False,
                     position=0,
                     file=sys.stdout)
         for i, (image_files, input_ids, attention_mask) in pbar:
             input_ids = input_ids.to(self.device)
             attention_mask = attention_mask.to(self.device)
             text_features = self.bert_model(input_ids,
                                             attention_mask=attention_mask)
             text_features = l2norm(text_features)
             text_features = self.text_encoder(text_features)
             image_files = torch.tensor(
                 list(
                     map(lambda x: int(re.findall(r'\d{12}', x)[0]),
                         image_files))).to(device)
             top_k = get_top_k_eval(text_features, image_features, k)
             for idx, indices in enumerate(top_k):
                 total_query += 1
                 true_image_id = image_files[idx]
                 top_k_images = torch.gather(image_ids, 0, indices)
                 if (top_k_images == true_image_id).nonzero().numel() > 0:
                     recall += 1
         recall = recall / total_query
         return recall
def matmul_loss_function(batch_size, matmul_sim):
    loss_filter = torch.ones(batch_size, dtype=torch.float32) - \
        torch.eye(batch_size, dtype=torch.float32)
    loss_filter = loss_filter.to(device)
    matmul_loss = torch.mul(matmul_sim, loss_filter).to(device)
    matmul_loss = torch.abs(matmul_loss).to(device)
    matmul_loss = l2norm(matmul_loss)
    matmul_loss = torch.mean(matmul_loss).to(device)
    return matmul_loss
Example #28
    def forward(self, input):
        features = self.feature_extractor(input)

        output = self.linear(features.squeeze())

        if self.l2_norm:
            output = l2norm(output)

        return output
def encode_topic_vector2(tparams, options, topics):
    t_emb = get_layer('ff')[1](tparams, topics, options, prefix='ff_topic_vector2', activ='linear')
    t_emb = l2norm(t_emb)
    #t_emb = maxnorm2(t_emb)

    if options['abs']:
        #im_emb = abs(im_emb)
        t_emb = tensor.maximum(t_emb, 0)
        
    return t_emb
def encode_images(tparams, options, im):
    im_emb = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')
    #if options['v_norm'] == 'l2' :
    im_emb = l2norm(im_emb)
    #im_emb = maxnorm2(im_emb)
    if options['abs']:
        #im_emb = abs(im_emb)
        im_emb = tensor.maximum(im_emb, 0)
        
    return im_emb
Example #31
    def forward(self, images):
        """ extract image feature vectors """
        # assuming that the precomputed features are already l2-normalized
        features = self.fc(images.float())

        # normalize in the joint embedding space
        if not self.no_imgnorm:
            features = l2norm(features)

        return features
    def forward(self, image_feature, image_attention_mask, input_ids,
                attention_mask, epoch):
        if epoch > 1 and self.frozen:
            self.frozen = False
            del self.lr_scheduler_0
            torch.cuda.empty_cache()

        image_feature = l2norm(image_feature).detach()
        final_image_features = l2norm(
            self.image_mha(image_feature, image_attention_mask))
        text_feature = self.bert_model(input_ids,
                                       attention_mask=attention_mask)
        text_feature = l2norm(text_feature)
        if epoch == 1:
            text_feature = text_feature.detach()
            self.frozen = True
        image_to_common = self.image_encoder(final_image_features)
        # image_to_common = final_image_features
        text_to_common = self.text_encoder(text_feature)
        return image_to_common, text_to_common
Example #33
def get_tolerance(simplex, L):
    if isinstance(simplex[-1], dict) and 'approx_min_ABC' in simplex[-1]:
        lbm = simplex[-1].get('approx_min_ABC')
    else:
        lbm = get_approx_lb_min(simplex, L)

    min_dist = None
    for v in simplex[:-1]:
        obj_dist = l2norm(v[-1]['obj'], lbm)
        if min_dist is None or obj_dist < min_dist:
            min_dist = obj_dist
    return min_dist
    def forward(self, sen2vec, vid2vec):
        # print('sen2vec.size: {}, vid2vec.size: {}'.format(
        #     sen2vec.size(), vid2vec.size()))
        if self.mode == 'simple':
            matmul_sim = self._matmul_similarity(sen2vec, vid2vec)
            cos_sim = self._cos_similarity(sen2vec, vid2vec)

        elif self.mode == 'multi':
            if sen2vec.size() != vid2vec.size():
                sen2vec = sen2vec.repeat_interleave(vid2vec.size(0), dim=0)
            multi_vec = torch.cat((sen2vec, vid2vec), dim=1)
            multi_vec = self.multi_linear(multi_vec)
            multi_vec = self.multi_dropout(multi_vec)
            multi_vec = l2norm(multi_vec)
            multi_sen2vec = torch.cat((sen2vec, multi_vec), dim=1)
            multi_sen2vec = self.sen_linear(multi_sen2vec)
            multi_sen2vec = self.sen_dropout(multi_sen2vec)
            multi_sen2vec = l2norm(multi_sen2vec)
            multi_vid2vec = torch.cat((vid2vec, multi_vec), dim=1)
            multi_vid2vec = self.vid_linear(multi_vid2vec)
            multi_vid2vec = self.vid_dropout(multi_vid2vec)
            multi_vid2vec = l2norm(multi_vid2vec)

            matmul_sim = torch.stack(
                (self._matmul_similarity(sen2vec, vid2vec),
                 self._matmul_similarity(multi_sen2vec, multi_vid2vec)),
                dim=0)
            matmul_sim = torch.mean(matmul_sim, dim=0)
            cos_sim = torch.stack(
                (self._cos_similarity(sen2vec, vid2vec),
                 self._cos_similarity(multi_sen2vec, multi_vid2vec)),
                dim=0)
            cos_sim = torch.mean(cos_sim, dim=0)

        else:
            matmul_sim = self._matmul_similarity(sen2vec, vid2vec)
            cos_sim = self._cos_similarity(sen2vec, vid2vec)
        # print('matmal, cos', matmul_sim.size(), cos_sim.size())

        return matmul_sim, cos_sim
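The _matmul_similarity and _cos_similarity methods of this similarity module are not shown. Hypothetical definitions, consistent with matmul_sim later being masked against an identity matrix in matmul_loss_function earlier on this page (a sketch only, not the original code):

import torch
import torch.nn.functional as F

def _matmul_similarity(self, sen2vec, vid2vec):
    # full batch-by-batch score matrix between (already l2-normalized) embeddings
    return torch.matmul(sen2vec, vid2vec.t())

def _cos_similarity(self, sen2vec, vid2vec):
    # cosine similarity between corresponding rows of the two batches
    return F.cosine_similarity(sen2vec, vid2vec, dim=1)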
Example #35
    def get_approx_min_max_angle(simplex, L):
        '''nD->nD Approximates the nD simplex's lower bound minimum by extending the longest edge by 1/cos(max angle)'''
        def get_angle(A, B, C):
            '''Finds angle in radians between AB and BC vectors'''
            vec1 = a(A) - a(B)
            vec2 = a(C) - a(B)
            return np.arccos(np.dot((vec1), (vec2))/ (enorm(vec1) * enorm(vec2)))     # radians to degree: * 180/np.pi

        # Choose longest edge vertexes
        A = simplex[0]
        B = simplex[1]

        # Find maximum angles for each vertex
        A_angles = []
        B_angles = []

        for V in nm(simplex):
            if V != A and V != B:
                A_angles.append(get_angle(B[:-1], A[:-1], V[:-1]))
                B_angles.append(get_angle(A[:-1], B[:-1], V[:-1]))

        max_A_angle = max(A_angles)
        max_B_angle = max(B_angles)

        v1 = simplex[0]
        v2 = simplex[1]

        if isinstance(simplex[-1], dict) and 'mins_AB' in simplex[-1]:
            mins_AB = simplex[-1]['mins_AB']
        else:
            mins_AB = find_mins_AB(simplex, L)

        return min([
            v1[-1]['obj'][0] - L*l2norm(nm(v1), mins_AB[0][:-1]) / np.cos(max_A_angle),
            v2[-1]['obj'][0] - L*l2norm(nm(v2), mins_AB[0][:-1]) / np.cos(max_B_angle)
        ])
    def _read(self, feat_path, label_path, proposal_folders):
        fn_node_pattern = '*_node.npz'
        fn_edge_pattern = '*_edge.npz'

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.labels = intdict2ndarray(self.idx2lb)
                self.inst_num = len(self.idx2lb)
                self.ignore_label = False
            else:
                self.lb2idxs, self.idx2lb = None, None
                self.labels = None
                self.inst_num = -1
                self.ignore_label = True
            if not self.featureless:
                features = read_probs(feat_path, self.inst_num,
                                      self.feature_dim)
                self.features = l2norm(features)
                if self.inst_num == -1:
                    self.inst_num = features.shape[0]
            else:
                assert self.inst_num > 0
                self.feature_dim = 1
                self.features = np.ones(self.inst_num).reshape(-1, 1)

        with Timer('read proposal list'):
            self.lst = []
            if callable(proposal_folders):
                proposal_folders = proposal_folders()
            for proposal_folder in proposal_folders:
                print('read proposals from folder: ', proposal_folder)
                fn_nodes = sorted(
                    glob.glob(os.path.join(proposal_folder, fn_node_pattern)))
                fn_edges = sorted(
                    glob.glob(os.path.join(proposal_folder, fn_edge_pattern)))
                assert len(fn_nodes) == len(
                    fn_edges), "node files({}) vs edge files({})".format(
                        len(fn_nodes), len(fn_edges))
                assert len(fn_nodes) > 0, 'files under {} is 0'.format(
                    proposal_folder)
                for fn_node, fn_edge in zip(fn_nodes, fn_edges):
                    assert fn_node[:fn_node.rfind('_')] == fn_edge[:fn_edge.rfind('_')], \
                        "{} vs {}".format(fn_node, fn_edge)
                    self.lst.append([fn_node, fn_edge])
            self.size = len(self.lst)
Example #37
def encode_sentences(tparams, options, x, mask):
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding (source)
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])

    # Encode sentences (source)
    proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                            prefix='encoder',
                                            mask=mask)
    s = l2norm(proj[0][-1])
    if options['abs']:
        s = abs(s)

    return s
def build_image_encoder(tparams, options):
    """
    Encoder only, for images
    """
    opt_ret = dict()

    trng = RandomStreams(1234)

    # image features
    im = tensor.matrix('im', dtype='float32')

    # Encode images
    images = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')
    images = l2norm(images)
    
    return trng, [im], images
Example #39
def sort_vertexes_longest_edge_first(simplex):
    '''nD->nD Moves longest edge vertexes to the simplex vertex list beginning.'''
    # Find simplex edges lengths
    edge_lengths = []   # [(vertex_index, vertex_index, edge_length),]
    for i, j in permutations(range(len(simplex[:-1])+1), 2):
        if j > i:
            edge_lengths.append((i, j, l2norm(simplex[i][:-1], simplex[j][:-1])))


    # Get longest edge vertexes ids
    le_i, le_j, le_length = max(edge_lengths, key=lambda x: x[-1])

    # Move longest edge vertexes to simplex vertex list beginning
    vi = simplex[le_i]
    vj = simplex[le_j]
    simplex.remove(vi)
    simplex.remove(vj)
    simplex.insert(0, vj)
    simplex.insert(0, vi)
    return simplex
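A tiny usage sketch, assuming each vertex is a list of coordinates followed by a metadata dict, as the other simplex helpers on this page suggest (hypothetical data, not from the original project):

simplex = [[1.0, 1.0, {'obj': [0.2]}],
           [0.0, 0.0, {'obj': [1.0]}],
           [3.0, 0.0, {'obj': [0.5]}]]
simplex = sort_vertexes_longest_edge_first(simplex)
# the longest-edge vertexes, (0, 0) and (3, 0), are moved to the front of the list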
def build_model(tparams, options):                                                                                           
    """
    Computation graph for the model
    """
    opt_ret = dict()
    trng = RandomStreams(1234)

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    im = tensor.matrix('im', dtype='float32')
    con = tensor.matrix('con', dtype='int64')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Word embedding (source)
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])

    # Encode sentences (source)
    proj = get_layer(options['encoder'])[1](tparams, emb, None, options,
                                            prefix='encoder',
                                            mask=mask)
    sents = proj[0][-1]
    sents = l2norm(sents)

    # Encode images (source)
    images = get_layer('ff')[1](tparams, im, options, prefix='ff_image', activ='linear')

    # Compute loss
    cost, updates = theano.scan(_step,
                                sequences=con,
                                outputs_info=tensor.alloc(0.),
                                non_sequences = [sents, images, options['margin']],
                                n_steps=con.shape[0],
                                profile=False,
                                strict=True)
    cost = cost[-1]
                               
    return trng, [x, mask, im, con], cost
Example #41
def find_mins_AB(simplex, L):
    ''' nD->nD
    Finds the intersection of AB' and B'A, where A, B are the longest edge vertexes.
    t - triangle (simplex).
    y - objective values for each vertex.
    Returns the lower Lipschitz bound minimum for the first edge (made from the first
    and second vertexes).
    '''
    dist = l2norm(nm(simplex[0]), nm(simplex[1]))
    x1 = a((0, simplex[0][-1]['obj'][0]))
    x2 = a((dist, simplex[0][-1]['obj'][0] - L*dist))
    x3 = a((dist, simplex[1][-1]['obj'][0]))
    x4 = a((0, simplex[1][-1]['obj'][0] - L*dist))

    ## 2D line intersection based on  http://mathworld.wolfram.com/Line-LineIntersection.html
    av = x2 - x1
    bv = x4 - x3
    cv = x3 - x1

    s = x1 + av * (np.cross(cv, bv) * (np.cross(av, bv))/( enorm(np.cross(av, bv))**2 ))
    X = a(simplex[0][:-1]) + s[0]/float(dist) * (a(simplex[1][:-1]) - a(simplex[0][:-1]))
    return [list(X) + [s[1]]]
def trainencoder(
      sources = ("image_vects", "word_vects")
    , sources_k = ("image_vects_k", "word_vects_k")
    , batch_size=128
    , embedding_dim=300
    , n_captions=5
    , n_sbu=None
    , separate_emb=False
    , test_size=1000 # per dataset
    , mode='dev'
    ):
    if mode=="coco120k+flickr38k":
        XYsplit_cum = ([], [], [], [])
        xyloaders = [
              "cocoXYFilenames(dataType='train2014')"
            , "cocoXYFilenames(dataType='val2014')"
            , "flickrXYFilenames(dataType='8k')"
            , "flickrXYFilenames(dataType='30k')"
            ]
        ntrains = [80000, 40000, 8000, 30000]

        for xyloader, ntrain in zip(xyloaders, ntrains):
            X, Y, _ = eval(xyloader)
            XYsplit = train_test_split(X, Y, train_size=ntrain)
            for i in range(len(XYsplit)):
                XYsplit_cum[i].extend(XYsplit[i])

        trX, teX, trY, teY = XYsplit_cum
    else:
        trX, teX, trY, teY = coco(mode=mode, n_captions=n_captions, test_size=test_size)
        if n_sbu:
            sbutrX, sbuteX, sbutrY, sbuteY = sbu(mode=mode, test_size=test_size)
            pairs = (
                  (trX, sbutrX)
                , (teX, sbuteX)
                , (trY, sbutrY)
                , (teY, sbuteY)
                )

            for coco_data, sbu_data in pairs:
                if isinstance(coco_data, list):
                    coco_data.extend(sbu_data)

    print("n_train: %d" % len(trX))
    print("n_test: %d" % len(teX))

    # # # # # # # # # # #
    # Modeling Building #
    # # # # # # # # # # #

    s = Encoder(
          image_feature_dim=4096
        , embedding_dim=embedding_dim
        , biases_init=Constant(0.)
        , weights_init=Uniform(width=0.08)
        )
    s.initialize()

    image_vects = tensor.matrix(sources[0]) # named to match the source name
    word_vects = tensor.tensor3(sources[1]) # named to match the source name
    image_vects_k = tensor.matrix(sources_k[0]) # named to match the contrastive source name
    word_vects_k = tensor.tensor3(sources_k[1]) # named to match the contrastive source name

    # image_vects.tag.test_value = np.zeros((2, 4096), dtype='float32')
    # word_vects.tag.test_value = np.zeros((2, 15, 50), dtype='float32')
    # image_vects_k.tag.test_value = np.zeros((2, 4096), dtype='float32')
    # word_vects_k.tag.test_value = np.zeros((2, 15, 50), dtype='float32')

    # learned image embedding, learned sentence embedding
    lim, ls = s.apply(image_vects, word_vects)

    # learned constrastive im embedding, learned contrastive s embedding
    lcim, lcs = s.apply(image_vects_k, word_vects_k)

    # identical cost code thanks to Ryan Kiros
    # https://github.com/youralien/skip-thoughts/blob/master/eval_rank.py
    lim = l2norm(lim)
    lcim = l2norm(lcim)
    ls = l2norm(ls)
    lcs = l2norm(lcs)

    margin = 0.2 # alpha term should not be more than 1

    cost_im = margin - (lim * ls).sum(axis=1) + (lim * lcs).sum(axis=1)
    cost_im = cost_im * (cost_im > 0.) # this is like the max(0, pairwise-ranking-loss)
    cost_im = cost_im.sum(0)

    cost_s = margin - (ls * lim).sum(axis=1) + (ls * lcim).sum(axis=1)
    cost_s = cost_s * (cost_s > 0.) # this is like max(0, pairwise-ranking-loss)
    cost_s = cost_s.sum(0)

    cost = cost_im + cost_s
    cost.name = "pairwise_ranking_loss"

    # function(s) to produce embedding
    if separate_emb:
        img_encoder = theano.function([image_vects], lim)
        txt_encoder = theano.function([word_vects], ls)
    f_emb = theano.function([image_vects, word_vects], [lim, ls])

    if n_sbu:
        sbuname = "sbu%d+" % n_sbu
    else:
        sbuname = ''
    name = "%sproject1.%s.jointembedder" % (sbuname, mode)
    savename = MODEL_FILES_DIR + name

    def save_function(self):
        if separate_emb:
            ModelIO.save(
                  img_encoder
                , savename + "_Img")
            ModelIO.save(
                  txt_encoder
                , savename + "_Txt")
        ModelIO.save(f_emb, savename)
        print "Similarity Embedding function(s) saved while training"

    def rank_function(stream):
        images, captions, _0, _1 = next(stream.get_epoch_iterator())
        image_embs, caption_embs = f_emb(images, captions)
        ModelEval.ImageSentenceRanking(image_embs, caption_embs)

    def rank_coco(self=None):
        # Get 1000 images / captions to test rank
        stream = DataETL.getFinalStream(teX, teY, sources=sources,
                            sources_k=sources_k, batch_size=test_size,
                            shuffle=True)
        print "COCO test"
        rank_function(stream)

    def rank_sbu(self=None):
        stream = DataETL.getFinalStream(sbuteX, sbuteY, sources=sources,
                            sources_k=sources_k, batch_size=test_size,
                            shuffle=True)
        print "SBU test"
        rank_function(stream)

    def rank_em(self=None):
        rank_coco()
        if n_sbu:
            rank_sbu()

    cg = ComputationGraph(cost)

    # # # # # # # # # # #
    # Modeling Training #
    # # # # # # # # # # #

    algorithm = GradientDescent(
          cost=cost
        , parameters=cg.parameters
        , step_rule=Adam(learning_rate=0.0002)
        )
    main_loop = MainLoop(
          model=Model(cost)
        , data_stream=DataETL.getFinalStream(trX, trY, sources=sources,
              sources_k=sources_k, batch_size=batch_size)
        , algorithm=algorithm
        , extensions=[
              DataStreamMonitoring(
                  [cost]
                , DataETL.getFinalStream(trX, trY, sources=sources,
                      sources_k=sources_k, batch_size=batch_size, shuffle=True)
                , prefix='train')
            , DataStreamMonitoring(
                  [cost]
                , DataETL.getFinalStream(teX, teY, sources=sources,
                      sources_k=sources_k, batch_size=batch_size, shuffle=True)
                , prefix='test')
            , UserFunc(save_function, after_epoch=True)
            , UserFunc(rank_em, after_epoch=True)
            , Printing()
            , LogToFile('logs/%s.csv' % name)
            ]
        )
    main_loop.run()