Ejemplo n.º 1
0
def vqa_eval_collate(inputs):
    """Collate VQA evaluation examples into a single batch dict.

    Every element of ``inputs`` is unpacked into six fields: question id,
    input ids, image features, image position features, attention mask and
    target. Text and attention sequences are padded with 0 to the longest
    sequence in the batch.

    Returns:
        dict with keys ``qids``, ``input_ids``, ``position_ids``,
        ``img_feat``, ``img_pos_feat``, ``attn_masks``, ``gather_index`` and
        ``targets``. ``targets`` is ``None`` when the first example carries
        no target, otherwise the stacked targets.
    """
    columns = [list(column) for column in unzip(inputs)]
    qids, input_ids, img_feats, img_pos_feats, attn_masks, targets = columns

    # Unpadded text lengths must be captured before padding.
    txt_lens = [ids.size(0) for ids in input_ids]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long
    ).unsqueeze(0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)

    # Only the first example is inspected to decide whether targets exist.
    targets = None if targets[0] is None else torch.stack(targets, dim=0)

    num_bbs = [feat.size(0) for feat in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1)
    )

    return {
        "qids": qids,
        "input_ids": input_ids,
        "position_ids": position_ids,
        "img_feat": img_feat,
        "img_pos_feat": img_pos_feat,
        "attn_masks": attn_masks,
        "gather_index": gather_index,
        "targets": targets,
    }
Ejemplo n.º 2
0
def itm_rank_collate(inputs):
    """Collate groups of image-text ranking examples into one batch dict.

    ``inputs`` is an indexable sequence of groups; all groups are flattened
    into one list of examples, each unpacked into input ids, image features,
    image position features and attention mask.

    Returns:
        dict with the padded tensors, the ``gather_index`` and
        ``sample_size`` (the length of the first group; all groups are
        asserted to have that length).
    """
    input_ids, img_feats, img_pos_feats, attn_masks = (
        list(column) for column in unzip(concat(inputs))
    )

    txt_lens = [ids.size(0) for ids in input_ids]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long
    ).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)

    # Every group has to contribute the same number of examples.
    sample_size = len(inputs[0])
    assert all(sample_size == len(group) for group in inputs)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1)
    )

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'sample_size': sample_size,
    }
Ejemplo n.º 3
0
    def get_batch(self, i, img_ids):
        """Pair text example ``i`` with every image listed in ``img_ids``.

        The combined input ids of the example are repeated once per image,
        image features are fetched through ``self._get_img_feat`` and padded,
        and an attention mask covering text plus the boxes of each image is
        built.

        Returns:
            dict with keys ``input_ids``, ``position_ids``, ``img_feat``,
            ``img_pos_feat``, ``attn_masks`` and ``gather_index``.
        """
        n_imgs = len(img_ids)
        example = super().__getitem__(i)

        text_ids = self.txt_db.combine_inputs(example['input_ids'])
        input_ids = text_ids.unsqueeze(0).expand(n_imgs, -1).clone()
        tl = input_ids.size(1)
        position_ids = torch.arange(0, tl, dtype=torch.long).unsqueeze(0)

        # process image features (gt always first)
        img_feats, img_pos_feats, num_bbs = (
            list(column) for column in unzip(map(self._get_img_feat, img_ids))
        )
        img_feat = pad_tensors(img_feats, num_bbs)
        img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

        # One mask row per image: ones over the text and that image's boxes.
        attn_masks = torch.zeros(n_imgs, max(num_bbs) + tl).long()
        for row, nbb in enumerate(num_bbs):
            attn_masks.data[row, :tl + nbb].fill_(1)

        gather_index = get_gather_index(
            [tl] * n_imgs, num_bbs, n_imgs, tl, attn_masks.size(1)
        )

        return {
            'input_ids': input_ids,
            'position_ids': position_ids,
            'img_feat': img_feat,
            'img_pos_feat': img_pos_feat,
            'attn_masks': attn_masks,
            'gather_index': gather_index,
        }
Ejemplo n.º 4
0
    def collate(inputs):
        """Collate video examples with caption masks and per-clip metadata.

        Each element of ``inputs`` is unpacked into video inputs, clip ranges,
        a list of attention masks and a metadata tuple
        ``(vid, clip_ids, timestamps, ground_truths)``.

        Returns:
            dict holding ``cap_attn_mask``, ``clip_ranges``, everything
            produced by ``video_collate`` and the flattened metadata lists
            ``vid_names``, ``clip_ids``, ``all_ts`` and ``gts``.
        """
        video_inputs, all_clip_ranges, attn_masks_list, metas = (
            list(column) for column in unzip(inputs)
        )

        flat_masks = list(concat(attn_masks_list))
        batch = {
            'cap_attn_mask': pad_sequence(
                flat_masks, batch_first=True, padding_value=0),
            'clip_ranges': tuple(tuple(rng) for rng in all_clip_ranges),
        }
        batch.update(video_collate(video_inputs))

        # meta: one row per clip, with the video name repeated for each clip
        rows = [
            (vid, int(cid), ts, gt)
            for vid, cids, tss, gts in metas
            for cid, ts, gt in zip(cids, tss, gts)
        ]
        batch['vid_names'] = [row[0] for row in rows]
        batch['clip_ids'] = [row[1] for row in rows]
        batch['all_ts'] = [row[2] for row in rows]
        batch['gts'] = [row[3] for row in rows]
        return batch
Ejemplo n.º 5
0
def video_qa_collate(inputs):
    """Collate video-QA examples into one batch dict.

    Each element of ``inputs`` is unpacked into six fields; the per-example
    lists of video inputs, QA input ids, QA attention masks, targets and
    timestamp targets are flattened across the batch before collation.

    Returns:
        the dict produced by ``video_collate`` extended with ``targets`` and
        ``ts_targets`` (both padded with -1), ``qa_input_ids``,
        ``qa_pos_ids`` and ``qa_attn_masks``.
    """
    (video_qa_inputs, qa_input_ids, qa_attn_masks,
     vids, target, ts_target) = (list(column) for column in unzip(inputs))

    def _flatten(groups):
        # Concatenate the per-example groups into one flat list.
        return [item for group in groups for item in group]

    batch = video_collate(_flatten(video_qa_inputs))

    batch["targets"] = pad_sequence(
        _flatten(target), batch_first=True, padding_value=-1)
    batch["ts_targets"] = pad_sequence(
        _flatten(ts_target), batch_first=True, padding_value=-1)

    input_ids, pos_ids, attn_masks = txt_input_collate(
        _flatten(qa_input_ids), _flatten(qa_attn_masks))
    batch['qa_input_ids'] = input_ids
    batch['qa_pos_ids'] = pos_ids
    batch['qa_attn_masks'] = attn_masks
    return batch
Ejemplo n.º 6
0
def batchify_fn_extract_ptr(pad, data, cuda=True):
    """Turn (source_list, target) pairs into forward and loss arguments.

    Args:
        pad: padding value used when tensorizing the sources.
        data: iterable of ``(source_list, target)`` pairs.
        cuda: forwarded to ``pad_batch_tensorize``.

    Returns:
        ``(fw_args, loss_args)`` where ``fw_args`` is
        ``(sources, src_nums, tar_in)`` and ``loss_args`` is ``(target,)``.

    The decoder input ``tar_in`` is each target shifted right by one position
    with its first element duplicated. The shift is done on a copy, so the
    target lists owned by ``data`` are no longer modified in place.
    """
    source_lists, targets = tuple(map(list, unzip(data)))

    src_nums = list(map(len, source_lists))
    sources = list(map(pad_batch_tensorize(pad=pad, cuda=cuda), source_lists))

    # PAD is -1 (dummy extraction index) for using sequence loss
    target = pad_batch_tensorize(targets, pad=-1, cuda=cuda)

    # To work with the one-sentence summary setting, the usual
    # ``tgt[:-1]`` truncation is replaced by a right shift.
    def remove_last(tgt):
        shifted = list(tgt)  # copy: never mutate the caller's target
        shifted[1:] = shifted[:-1]
        return shifted

    tar_in = pad_batch_tensorize(
        [remove_last(tgt) for tgt in targets],
        pad=0,  # use 0 here for feeding first conv sentence repr.
        cuda=cuda
    )

    fw_args = (sources, src_nums, tar_in)
    loss_args = (target, )
    return fw_args, loss_args
Ejemplo n.º 7
0
def itm_collate(inputs):
    """Collate image-text matching examples into one batch dict.

    Each element of ``inputs`` is unpacked into input ids, image features,
    image position features, attention mask and target. Sequences are padded
    with 0 and the per-example targets are concatenated along dim 0.

    Returns:
        dict with keys ``input_ids``, ``position_ids``, ``img_feat``,
        ``img_pos_feat``, ``attn_masks``, ``gather_index`` and ``targets``.
    """
    input_ids, img_feats, img_pos_feats, attn_masks, targets = (
        list(column) for column in unzip(inputs)
    )

    # Keep the true text lengths; padding below hides them.
    txt_lens = [ids.size(0) for ids in input_ids]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long
    ).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    targets = torch.cat(targets, dim=0)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1)
    )

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'targets': targets,
    }
Ejemplo n.º 8
0
def batchify_fn_extract_ptr_entity(pad, data, cuda=True):
    """Batchify (source_list, target, cluster_info) triples.

    Each cluster info is itself a triple ``(clusters, word positions,
    sentence positions)``.

    Returns:
        ``(fw_args, loss_args)`` where ``fw_args`` is ``(sources, src_nums,
        tar_in, (clusters, cluster_wpos, cluster_spos), cl_nums)`` and
        ``loss_args`` is ``(target,)``.
    """
    source_lists, targets, clusters_infos = (
        list(column) for column in unzip(data)
    )
    cluster_lists, cluster_wpos, cluster_spos = zip(*clusters_infos)

    src_nums = [len(source) for source in source_lists]
    # An example without clusters is still counted as having one.
    cl_nums = [len(cluster) if len(cluster) != 0 else 1
               for cluster in cluster_lists]

    # pad_batch_tensorize is called with keywords only and the result is then
    # applied per list -- presumably a curried function (TODO confirm).
    tensorize_source = pad_batch_tensorize(pad=pad, cuda=cuda, max_num=5)
    tensorize_cluster = pad_batch_tensorize(pad=pad, cuda=cuda, max_num=4)

    sources = [tensorize_source(source) for source in source_lists]
    # list of tensors, each tensor padded
    clusters = [tensorize_cluster(cluster) for cluster in cluster_lists]
    cluster_wpos = [tensorize_cluster(wpos) for wpos in cluster_wpos]
    cluster_spos = [tensorize_cluster(spos) for spos in cluster_spos]

    # PAD is -1 (dummy extraction index) for using sequence loss
    target = pad_batch_tensorize(targets, pad=-1, cuda=cuda)

    # Decoder input: every target without its last element, padded with 0
    # (0 is used for feeding the first conv sentence repr.).
    tar_in = pad_batch_tensorize(
        [tgt[:-1] for tgt in targets],
        pad=0,
        cuda=cuda
    )

    cluster_inputs = (clusters, cluster_wpos, cluster_spos)
    fw_args = (sources, src_nums, tar_in, cluster_inputs, cl_nums)
    loss_args = (target, )
    return fw_args, loss_args
Ejemplo n.º 9
0
def mrm_collate_for_vcr(inputs):
    """Collate VCR examples for masked-region modelling.

    Each element of ``inputs`` is unpacked into input ids, position ids,
    type ids, image features, image position features, attention mask and
    image mask. All sequences are padded with 0; image features are copied
    into zero-initialised tensors sized for the largest box count.

    Returns:
        tuple ``(input_ids, position_ids, type_ids, txt_lens, img_feat,
        img_pos_feat, num_bbs, attn_masks, img_masks)``.
    """
    (input_ids, position_ids, type_ids, img_feats, img_pos_feats, attn_masks,
     img_masks) = (list(column) for column in unzip(inputs))

    txt_lens = [ids.size(0) for ids in input_ids]
    num_bbs = [feat.size(0) for feat in img_feats]

    def _zero_pad(sequences):
        # Shared padding call: batch-first, filled with zeros.
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    input_ids = _zero_pad(input_ids)
    position_ids = _zero_pad(position_ids)
    type_ids = _zero_pad(type_ids)
    attn_masks = _zero_pad(attn_masks)
    img_masks = _zero_pad(img_masks)

    n_examples = len(img_feats)
    max_boxes = max(num_bbs)
    img_feat = torch.zeros(n_examples, max_boxes, img_feats[0].size(1))
    img_pos_feat = torch.zeros(n_examples, max_boxes,
                               img_pos_feats[0].size(1))
    for row, (feat, pos) in enumerate(zip(img_feats, img_pos_feats)):
        n_boxes = feat.size(0)
        img_feat.data[row, :n_boxes, :] = feat.data
        img_pos_feat.data[row, :n_boxes, :] = pos.data

    return (input_ids, position_ids, type_ids, txt_lens, img_feat,
            img_pos_feat, num_bbs, attn_masks, img_masks)
Ejemplo n.º 10
0
 def parse_df(df):
     """Parse every entry of ``df['code']`` and store the results as columns.

     A lexer with no-op callbacks and a vocabulary are built, then
     ``monitored_slk_parse`` is mapped over the code column. Each parse
     yields seven values (seven ``None`` on error), written to the columns
     ``parse_tree``, ``tokens``, ``consistent_identifier``,
     ``identifier_scope_index``, ``is_identifier``, ``max_scope_list`` and
     ``consistent_typename``.

     Returns:
         the same ``df`` object, modified in place.
     """
     identifier_set, type_set = extract_fake_c_header_identifier()

     clex = BufferedCLex(error_func=lambda self, msg, line, column: None,
                         on_lbrace_func=lambda: None,
                         on_rbrace_func=lambda: None,
                         type_lookup_func=lambda typ: None)
     clex.build()

     begin_token, end_token, unk_token = "<BEGIN>", "<END>", "<UNK>"
     from embedding.wordembedding import load_vocabulary
     vocabulary = load_vocabulary(get_token_vocabulary,
                                  get_vocabulary_id_map_with_keyword,
                                  [begin_token], [end_token], unk_token)

     print("the size of predefined_identifer:{}".format(len(identifier_set)))
     print("the size of typeset:{}".format(len(type_set)))

     parse_fn = monitored_slk_parse(clex=clex,
                                    predefined_identifer=identifier_set,
                                    predefined_typename=type_set,
                                    vocabulary=vocabulary)
     parsed_rows = show_process_map(parse_fn, df['code'],
                                    error_default_value=(None,) * 7)
     parsed_columns = unzip(parsed_rows)

     column_names = (
         'parse_tree',
         'tokens',
         'consistent_identifier',
         'identifier_scope_index',
         'is_identifier',
         'max_scope_list',
         'consistent_typename',
     )
     for position, column_name in enumerate(column_names):
         df[column_name] = list(parsed_columns[position])
     return df
Ejemplo n.º 11
0
def coll_fn(data):
    """Flatten the source and target lists of a batch, dropping empty items.

    ``data`` is an iterable of ``(source_list, target_list)`` pairs. Both
    sides are concatenated across the batch and falsy entries are removed.

    Returns:
        ``(sources, targets)`` as two flat lists.
    """
    source_lists, target_lists = unzip(data)
    # NOTE: independent filtering works because
    #       source and targets are matched properly by the Dataset
    sources = [source for source in concat(source_lists) if source]
    targets = [target for target in concat(target_lists) if target]
    assert all(sources) and all(targets)
    return sources, targets
Ejemplo n.º 12
0
def get_ext_word2id(unk, word2id, batch):
    """Return ``word2id`` extended with the unseen words of the batch sources.

    Args:
        unk: unused here; kept so the signature stays unchanged.
        word2id: base vocabulary mapping word -> id. It is not modified.
        batch: iterable of ``(source, target)`` pairs, where ``source`` is an
            iterable of words. Targets do not contribute to the vocabulary.

    Returns:
        a new dict containing every entry of ``word2id`` plus each
        out-of-vocabulary source word, numbered consecutively in order of
        first appearance starting at ``len(word2id)``.

    Membership is tested against the extended vocabulary. Testing against
    ``word2id`` re-numbered a repeated out-of-vocabulary word, which made a
    later new word receive the same id.
    """
    ext_word2id = dict(word2id)
    for source, _target in batch:
        for word in source:
            if word not in ext_word2id:
                ext_word2id[word] = len(ext_word2id)
    return ext_word2id
Ejemplo n.º 13
0
def convert_batch(unk, word2id, batch):
    """Convert the sources and targets of a batch with ``conver2id``.

    Original author's note: words that are not in the vocabulary are replaced
    by the unknown id, producing id versions of the input article and
    abstract.

    Returns:
        list of ``(source, target)`` pairs after conversion.
    """
    sources, targets = (list(column) for column in unzip(batch))
    source_ids = conver2id(unk, word2id, sources)
    target_ids = conver2id(unk, word2id, targets)
    # Original author's note: the sources are still a list of lists here.
    return list(zip(source_ids, target_ids))
Ejemplo n.º 14
0
def coll_fn(data):
    """Merge all source and target lists of a batch and discard empty ones.

    Args:
        data: iterable of ``(source_list, target_list)`` pairs.

    Returns:
        tuple ``(sources, targets)`` of flat lists without falsy entries.
    """
    source_lists, target_lists = unzip(data)
    # NOTE: independent filtering works because
    #       source and targets are matched properly by the Dataset
    flat_sources = concat(source_lists)
    flat_targets = concat(target_lists)
    sources = list(filter(None, flat_sources))
    targets = list(filter(None, flat_targets))
    assert all(sources) and all(targets)
    return sources, targets
Ejemplo n.º 15
0
def generate_data(max_index=10, max_length=10, min_length=5, number=100000):
    """Build a DataFrame of generated sequences and their targets.

    ``number`` sequences are produced with ``generate_one_seq`` and stored in
    column ``x``. ``create_target`` is applied to each one and its two
    outputs are stored in columns ``y`` and ``transform_id``.

    Returns:
        the populated ``pandas.DataFrame``.
    """
    df = pd.DataFrame()

    sequences = []
    for _ in range(number):
        sequences.append(generate_one_seq(max_index, max_length, min_length))
    df['x'] = sequences

    target_pairs = [create_target(sequence) for sequence in df['x']]
    y, transform_id = (list(column) for column in unzip(target_pairs))

    df['y'] = y
    df['transform_id'] = transform_id
    return df
Ejemplo n.º 16
0
 def parse_df(df):
     """Parse ``df['code']`` and add ``parse_tree``, ``ast`` and ``tokens``.

     A fresh ``MonitoredParser`` is created and its
     ``parse_get_production_list_and_token_list`` method is mapped over the
     code column; a failed parse contributes ``(None, None, None)``.

     Returns:
         the same ``df`` object, modified in place.
     """
     parser = MonitoredParser()
     parsed_rows = show_process_map(
         parser.parse_get_production_list_and_token_list,
         df['code'],
         error_default_value=(None, None, None))
     parsed_columns = unzip(parsed_rows)

     for position, column_name in enumerate(('parse_tree', 'ast', 'tokens')):
         df[column_name] = list(parsed_columns[position])
     return df
Ejemplo n.º 17
0
def mlm_collate_for_vcr(inputs):
    """Collate VCR examples for masked language modelling.

    Each element of ``inputs`` is unpacked into input ids, text type ids,
    image features, image position features, attention mask and text labels.
    The first five go to ``vcr_pretrain_collate``; the labels are padded with
    -1 and added to the resulting batch under ``txt_labels``.
    """
    columns = [list(column) for column in unzip(inputs)]
    (input_ids, txt_type_ids, img_feats, img_pos_feats, attn_masks,
     txt_labels) = columns

    batch = vcr_pretrain_collate(
        input_ids, txt_type_ids, img_feats, img_pos_feats, attn_masks)
    batch['txt_labels'] = pad_sequence(
        txt_labels, batch_first=True, padding_value=-1)
    return batch
Ejemplo n.º 18
0
def coll_fn(data):
    """Split a batch into per-article source and target lists.

    Args:
        data: iterable of ``(article_source, article_target)`` pairs.

    Returns:
        ``(sources, targets)``; each is a two-level list with one inner list
        per article (the original note says each inner list holds that
        article's n sentences).
    """
    source_lists, target_lists = unzip(data)

    sources = [list(article_source) for article_source in source_lists]
    targets = [list(article_target) for article_target in target_lists]

    # No article may be empty on either side.
    assert all(sources) and all(targets)
    return sources, targets
Ejemplo n.º 19
0
def coll_fn_graph(data):
    """Collate graph examples, dropping empty sources and targets.

    Each element of ``data`` is unpacked into six fields. Sources are
    filtered per example (not flattened), targets are flattened across the
    batch and then filtered. The remaining four fields are returned exactly
    as ``unzip`` produced them.

    Returns:
        ``(sources, targets, nodes, edges, subgraphs, paras)``.
    """
    source_lists, target_lists, nodes, edges, subgraphs, paras = unzip(data)
    # NOTE: independent filtering works because
    #       source and targets are matched properly by the Dataset
    sources = [source for source in source_lists if source]
    targets = [target for target in concat(target_lists) if target]
    assert all(sources) and all(targets)
    return sources, targets, nodes, edges, subgraphs, paras
Ejemplo n.º 20
0
Archivo: mrm.py Proyecto: zmykevin/UC2
def xlmr_mrfr_collate(inputs):
    """Collate examples for masked-region feature regression.

    Each element of ``inputs`` is unpacked into input ids, image features,
    image position features, attention mask, image mask and image mask
    target. Input ids are padded with 1; the masks with 0.

    Returns:
        dict with keys ``input_ids``, ``position_ids``, ``img_feat`` (after
        ``_mask_img_feat``), ``img_pos_feat``, ``attn_masks``,
        ``gather_index``, ``feat_targets`` (taken from the features before
        masking), ``img_masks`` and ``img_mask_tgt``.
    """
    columns = [list(column) for column in unzip(inputs)]
    (input_ids, img_feats, img_pos_feats, attn_masks, img_masks,
     img_mask_tgts) = columns

    txt_lens = [ids.size(0) for ids in input_ids]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=1)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long
    ).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in img_feats]
    padded_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    # mask features: targets are read first, then the features are masked
    img_masks = pad_sequence(img_masks, batch_first=True, padding_value=0)
    feat_targets = _get_feat_target(padded_feat, img_masks)
    img_feat = _mask_img_feat(padded_feat, img_masks)
    img_mask_tgt = pad_sequence(
        img_mask_tgts, batch_first=True, padding_value=0)

    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1)
    )

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'feat_targets': feat_targets,
        'img_masks': img_masks,
        'img_mask_tgt': img_mask_tgt,
    }
Ejemplo n.º 21
0
def xlmr_mmxlm_softlabel_collate(inputs):
    """Collate examples carrying soft labels for the image tokens.

    Each element of ``inputs`` is unpacked into input ids, image features,
    image position features, attention mask, image mask, target mask and
    image-token soft labels.

    Returns:
        dict with keys
        ``input_ids``     -- padded with 1,
        ``position_ids``  -- ``arange`` over the padded text length with a
                             leading batch dim of 1,
        ``img_feat``      -- padded features after ``_mask_img_feat``,
        ``img_pos_feat``  -- padded position features,
        ``attn_masks``    -- padded with 0,
        ``gather_index``  -- result of ``get_gather_index``,
        ``img_masks``     -- padded with 0,
        ``tgt_masks``     -- padded with 0,
        ``label_targets`` -- ``_get_targets`` applied to the image masks and
                             the padded soft labels.
    """
    (input_ids, img_feats, img_pos_feats, attn_masks, img_masks, tgt_masks,
     img_token_soft_labels) = map(list, unzip(inputs))

    # text batches
    txt_lens = [i.size(0) for i in input_ids]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=1)
    position_ids = torch.arange(0, input_ids.size(1),
                                dtype=torch.long).unsqueeze(0)

    # image batches
    num_bbs = [f.size(0) for f in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)
    img_token_soft_label = pad_tensors(img_token_soft_labels, num_bbs)

    # masks
    img_masks = pad_sequence(img_masks, batch_first=True, padding_value=0)
    tgt_masks = pad_sequence(tgt_masks, batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)

    # Mask the selected image features and collect their label targets.
    img_feat = _mask_img_feat(img_feat, img_masks)
    label_targets = _get_targets(img_masks, img_token_soft_label)

    bs, max_tl = input_ids.size()
    out_size = attn_masks.size(1)
    gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size)

    batch = {'input_ids': input_ids,
             'position_ids': position_ids,
             'img_feat': img_feat,
             'img_pos_feat': img_pos_feat,
             'attn_masks': attn_masks,
             'gather_index': gather_index,
             'img_masks': img_masks,
             'tgt_masks': tgt_masks,
             'label_targets': label_targets}
    return batch
Ejemplo n.º 22
0
    def __init__(self, items, dict_remove=None):
        """Store references and selectors for ``items``.

        ``self._try_ref`` is called for every item together with a ``remove``
        callback and must return an ``(item, selector)`` pair; the pairs are
        split into the tuples ``self._items`` and ``self._selectors``.

        Args:
            items: iterable of objects to reference.
            dict_remove: optional callable invoked with this object when the
                callback fires while this object is still alive.
        """
        def remove(k, selfref=ref(self), dict_remove=dict_remove):
            # ``ref(self)`` is bound as a default so the callback does not
            # hold this object directly -- presumably a weak reference
            # (TODO confirm ``ref`` is ``weakref.ref``).
            owner = selfref()
            if owner is None or dict_remove is None:
                return
            dict_remove(owner)

        item_refs, selectors = unzip(
            self._try_ref(item, remove) for item in items)
        self._items = tuple(item_refs)
        self._selectors = tuple(selectors)
Ejemplo n.º 23
0
    def __init__(self, items, dict_remove=None):
        """Build the ``_items`` and ``_selectors`` tuples from ``items``.

        Each item is passed to ``self._try_ref`` along with a ``remove``
        callback; the returned pairs are separated into two tuples.
        ``dict_remove``, when given, is called with this object from the
        callback as long as this object can still be resolved.
        """
        def remove(k, selfref=ref(self), dict_remove=dict_remove):
            # Resolve the reference captured at construction time; it yields
            # None once the owner is gone (assuming ``ref`` is weakref.ref).
            target = selfref()
            if target is not None and dict_remove is not None:
                dict_remove(target)

        pairs = (self._try_ref(item, remove) for item in items)
        referenced, selector_flags = unzip(pairs)
        self._items = tuple(referenced)
        self._selectors = tuple(selector_flags)
Ejemplo n.º 24
0
def convert_batch_gat(unk, word2id, batch):
    """Convert a graph batch to ids using base and extended vocabularies.

    Each element of ``batch`` is unpacked into source, target, node info and
    edge info. Node info splits into five fields and edge info into three.
    The extended vocabulary adds every source word missing from ``word2id``.

    Returns:
        list of 12-tuples ``(source, src_ext, tar_in, target, nodewords,
        nodelengths, sum_worhies, word_freq_feat, nodefreq, relations,
        rlengths, triples)`` where ``source`` / ``tar_in`` use ``word2id``
        and ``src_ext`` / ``target`` use the extended vocabulary.
    """
    sources, targets, node_infos, edge_infos = (
        list(column) for column in unzip(batch)
    )
    (nodewords, nodelengths, sum_worhies, word_freq_feat,
     nodefreq) = unzip(node_infos)
    relations, rlengths, triples = unzip(edge_infos)

    # Extend the vocabulary with unseen source words, in order of appearance.
    ext_word2id = dict(word2id)
    for source in sources:
        for word in source:
            ext_word2id.setdefault(word, len(ext_word2id))

    src_exts = conver2id(unk, ext_word2id, sources)
    sources = conver2id(unk, word2id, sources)
    tar_ins = conver2id(unk, word2id, targets)
    targets = conver2id(unk, ext_word2id, targets)

    return list(zip(
        sources, src_exts, tar_ins, targets,
        nodewords, nodelengths, sum_worhies, word_freq_feat, nodefreq,
        relations, rlengths, triples,
    ))
Ejemplo n.º 25
0
def coll_fn(data):
    """Split ``(label, concat_text)`` pairs into two parallel lists.

    Returns:
        ``(labels, concat_texts)``. Every text is asserted to be truthy.
    """
    label_column, text_column = unzip(data)
    labels, concat_texts = list(label_column), list(text_column)

    assert all(concat_texts)
    return labels, concat_texts
Ejemplo n.º 26
0
def batchify_fn(pad, data, cuda=True):
    """Batchify ``(source_list, targets)`` pairs into tensors.

    All source lists are flattened and tensorized with padding ``pad``; all
    targets are flattened into one long tensor (a CUDA tensor when ``cuda``
    is true).

    Returns:
        ``(fw_args, loss_args)`` = ``((sources,), (target,))``.
    """
    source_lists, targets = (list(column) for column in unzip(data))

    flat_sources = [source for group in source_lists for source in group]
    sources = pad_batch_tensorize(inputs=flat_sources, pad=pad, cuda=cuda)

    if cuda:
        tensor_type = torch.cuda.LongTensor
    else:
        tensor_type = torch.LongTensor
    target = tensor_type([label for group in targets for label in group])

    return (sources,), (target,)
Ejemplo n.º 27
0
def vcr_eval_collate(inputs):
    """Collate VCR evaluation examples.

    Each element of ``inputs`` is unpacked into twelve fields. Nine of them
    hold a per-example list and are flattened across the batch; the QA and
    QAR targets are stacked instead. Sequences are padded with 0 and image
    features are copied into zero-initialised tensors.

    Returns:
        tuple ``(qids, all_input_ids, all_position_ids, all_txt_lens,
        all_txt_type_ids, all_img_feat, all_img_pos_feat, all_num_bbs,
        all_attn_masks, all_qa_targets, all_qar_targets, all_obj_targets)``.
    """
    (qids, input_ids, position_ids, txt_lens, txt_type_ids, img_feats,
     img_pos_feats, num_bbs, attn_masks, qa_targets, qar_targets,
     obj_targets) = (list(column) for column in unzip(inputs))

    def _flatten(groups):
        # Concatenate the per-example lists into one flat list.
        return [item for group in groups for item in group]

    def _zero_pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    all_txt_lens = _flatten(txt_lens)
    all_num_bbs = _flatten(num_bbs)
    all_img_feats = _flatten(img_feats)
    all_img_pos_feats = _flatten(img_pos_feats)

    all_input_ids = _zero_pad(_flatten(input_ids))
    all_position_ids = _zero_pad(_flatten(position_ids))
    all_txt_type_ids = _zero_pad(_flatten(txt_type_ids))
    all_attn_masks = _zero_pad(_flatten(attn_masks))
    all_obj_targets = _zero_pad(_flatten(obj_targets))
    all_qa_targets = torch.stack(qa_targets, dim=0)
    all_qar_targets = torch.stack(qar_targets, dim=0)

    n_rows = len(all_img_feats)
    max_boxes = max(all_num_bbs)
    all_img_feat = torch.zeros(n_rows, max_boxes, all_img_feats[0].size(1))
    all_img_pos_feat = torch.zeros(n_rows, max_boxes,
                                   all_img_pos_feats[0].size(1))
    for row, (feat, pos) in enumerate(zip(all_img_feats, all_img_pos_feats)):
        n_boxes = feat.size(0)
        all_img_feat.data[row, :n_boxes, :] = feat.data
        all_img_pos_feat.data[row, :n_boxes, :] = pos.data

    return (qids, all_input_ids, all_position_ids, all_txt_lens,
            all_txt_type_ids, all_img_feat, all_img_pos_feat, all_num_bbs,
            all_attn_masks, all_qa_targets, all_qar_targets, all_obj_targets)
Ejemplo n.º 28
0
def mlm_collate(inputs):
    """Collate masked-language-modelling examples into one batch dict.

    Each element of ``inputs`` is unpacked into input ids, image features,
    image position features, attention mask, text labels, text-only
    attention mask and image-only attention mask.

    Returns:
        dict with keys ``input_ids`` (padded with 0), ``position_ids``,
        ``img_feat``, ``img_pos_feat``, ``attn_masks``, ``attn_masks_txt``,
        ``attn_masks_img`` (all three padded with 0), ``gather_index`` and
        ``txt_labels`` (padded with -1).
    """
    (input_ids, img_feats, img_pos_feats, attn_masks, txt_labels,
     attn_masks_txt, attn_masks_img) = (
        list(column) for column in unzip(inputs))

    def _zero_pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    # text batches
    txt_lens = [ids.size(0) for ids in input_ids]
    input_ids = _zero_pad(input_ids)
    txt_labels = pad_sequence(txt_labels, batch_first=True, padding_value=-1)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long
    ).unsqueeze(0)

    # image batches
    num_bbs = [feat.size(0) for feat in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    # joint mask plus the separate text / image masks of the two-flow setup
    attn_masks = _zero_pad(attn_masks)
    attn_masks_txt = _zero_pad(attn_masks_txt)
    attn_masks_img = _zero_pad(attn_masks_img)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1)
    )

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'attn_masks_txt': attn_masks_txt,
        'attn_masks_img': attn_masks_img,
        'gather_index': gather_index,
        'txt_labels': txt_labels,
    }
Ejemplo n.º 29
0
def itm_ot_collate(inputs):
    """Collate image-text matching examples together with OT inputs.

    Each element of ``inputs`` is unpacked into input ids, image features,
    image position features, attention mask, target, text-only attention
    mask and image-only attention mask.

    Returns:
        dict with the padded tensors, ``gather_index``, the concatenated
        ``targets`` and ``ot_inputs``, a dict holding ``ot_scatter``,
        ``scatter_max``, ``txt_pad`` and ``img_pad``.
    """
    (input_ids, img_feats, img_pos_feats, attn_masks, targets,
     attn_masks_txt, attn_masks_img) = (
        list(column) for column in unzip(inputs))

    def _zero_pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    txt_lens = [ids.size(0) for ids in input_ids]

    input_ids = _zero_pad(input_ids)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long
    ).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    attn_masks = _zero_pad(attn_masks)
    attn_masks_txt = _zero_pad(attn_masks_txt)
    attn_masks_img = _zero_pad(attn_masks_img)

    targets = torch.cat(targets, dim=0)
    bs, padded_tl = input_ids.size()
    out_size = attn_masks.size(1)
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, padded_tl, out_size)

    # OT inputs are sized from the longest text and the largest box count.
    longest_txt = max(txt_lens)
    most_boxes = max(num_bbs)
    ot_scatter = _compute_ot_scatter(
        txt_lens, longest_txt, attn_masks.size(1))
    ot_inputs = {
        'ot_scatter': ot_scatter,
        'scatter_max': ot_scatter.max().item(),
        'txt_pad': _compute_pad(txt_lens, longest_txt),
        'img_pad': _compute_pad(num_bbs, most_boxes),
    }

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'attn_masks_txt': attn_masks_txt,
        'attn_masks_img': attn_masks_img,
        'gather_index': gather_index,
        'targets': targets,
        'ot_inputs': ot_inputs,
    }
Ejemplo n.º 30
0
    def parse_input(batch_data, do_sample=False):
        """Convert a raw batch dict into the tensors fed to the model.

        Reads the keys ``error_token_ids``, ``error_line_length``,
        ``error_line_token_length``, ``error_token_length`` and ``adj`` from
        ``batch_data``; when ``do_sample`` is false it also reads
        ``error_line``, ``target_line_ids`` and ``target_line_length``.

        ``use_ast`` and ``ignore_id`` are free variables taken from the
        enclosing scope (not visible in this block).

        Returns:
            tuple ``(input_seq, input_line_length, input_line_token_length,
            input_length, adj_matrix, target_error_position, target_seq,
            target_length)``; the last three are ``None`` when ``do_sample``
            is true.
        """
        # Token ids are padded with 0; the other PaddedList calls use the
        # PaddedList default fill value.
        input_seq = to_cuda(
            torch.LongTensor(
                PaddedList(batch_data['error_token_ids'], fill_value=0)))
        input_line_length = to_cuda(
            torch.LongTensor(PaddedList(batch_data['error_line_length'])))
        input_line_token_length = to_cuda(
            torch.LongTensor(PaddedList(
                batch_data['error_line_token_length'])))

        input_length = to_cuda(
            torch.LongTensor(PaddedList(batch_data['error_token_length'])))
        if not use_ast:
            # 'adj' is converted to a LongTensor as-is.
            adj_matrix = to_cuda(torch.LongTensor(batch_data['adj']))
        else:
            # Prefix every entry of each sample's 'adj' list with the
            # sample's batch index.
            adjacent_tuple = [[[i] + tt for tt in t]
                              for i, t in enumerate(batch_data['adj'])]
            # Flatten over samples, then transpose into one list per
            # coordinate (the index layout of a sparse tensor).
            adjacent_tuple = [
                list(t) for t in unzip(more_itertools.flatten(adjacent_tuple))
            ]
            size = max(batch_data['error_token_length'])
            # print("max length in this batch:{}".format(size))
            adjacent_tuple = torch.LongTensor(adjacent_tuple)
            # One value of 1 per listed index.
            adjacent_values = torch.ones(adjacent_tuple.shape[1]).long()
            # Dense shape is (batch, size, size); this presumably means each
            # 'adj' entry is a (row, col) pair -- TODO confirm.
            adjacent_size = torch.Size(
                [len(batch_data['error_token_length']), size, size])
            info('batch_data input_length: ' +
                 str(batch_data['error_token_length']))
            info('size: ' + str(size))
            info('adjacent_tuple: ' + str(adjacent_tuple.shape))
            info('adjacent_size: ' + str(adjacent_size))
            # Build the sparse tensor, then convert to a dense float tensor.
            adj_matrix = to_cuda(
                torch.sparse.LongTensor(
                    adjacent_tuple,
                    adjacent_values,
                    adjacent_size,
                ).float().to_dense())

        if not do_sample:
            target_error_position = to_cuda(
                torch.LongTensor(PaddedList(batch_data['error_line'])))
            # Target ids are padded with ignore_id.
            target_seq = to_cuda(
                torch.LongTensor(
                    PaddedList(batch_data['target_line_ids'],
                               fill_value=ignore_id)))
            target_length = to_cuda(
                torch.LongTensor(PaddedList(batch_data['target_line_length'])))
        else:
            # No targets are built when sampling.
            target_error_position = None
            target_seq = None
            target_length = None

        return input_seq, input_line_length, input_line_token_length, input_length, adj_matrix, target_error_position, target_seq, target_length
Ejemplo n.º 31
0
    def collate(inputs):
        """Collate video examples together with their caption inputs.

        Each element of ``inputs`` is unpacked into video inputs, clip ranges
        and a list of caption triples ``(input_ids, tgt_ids, attn_mask)``.
        Caption triples are flattened across the batch before padding.

        Returns:
            dict with ``cap_input_ids`` (padded with 1), ``cap_pos_ids``,
            ``cap_tgt_ids`` (padded with -1), ``cap_attn_mask`` (padded with
            0), ``clip_ranges`` and everything ``video_collate`` returns.
        """
        video_inputs, all_clip_ranges, cap_inputs = (
            list(column) for column in unzip(inputs))

        all_input_ids, all_tgt_ids, all_attn_masks = (
            list(column) for column in unzip(concat(cap_inputs)))

        input_ids = pad_sequence(
            all_input_ids, batch_first=True, padding_value=1)
        position_ids = torch.arange(
            0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

        batch = {
            'cap_input_ids': input_ids,
            'cap_pos_ids': position_ids,
            'cap_tgt_ids': pad_sequence(
                all_tgt_ids, batch_first=True, padding_value=-1),
            'cap_attn_mask': pad_sequence(
                all_attn_masks, batch_first=True, padding_value=0),
            'clip_ranges': tuple(tuple(rng) for rng in all_clip_ranges),
        }
        batch.update(video_collate(video_inputs))
        return batch
Ejemplo n.º 32
0
def batchify_fn_extract_ff(pad, data, cuda=True):
    """Batchify ``(source_list, targets)`` pairs for the feed-forward extractor.

    Each source list is tensorized separately with padding ``pad``; all
    targets are flattened into a single float tensor (a CUDA tensor when
    ``cuda`` is true).

    Returns:
        ``(fw_args, loss_args)`` = ``((sources, src_nums), (target,))``.
    """
    source_lists, targets = (list(column) for column in unzip(data))

    # Called with keywords only and then applied per list -- presumably a
    # curried function (TODO confirm).
    tensorize = pad_batch_tensorize(pad=pad, cuda=cuda)

    src_nums = [len(source) for source in source_lists]
    sources = [tensorize(source) for source in source_lists]

    if cuda:
        tensor_type = torch.cuda.FloatTensor
    else:
        tensor_type = torch.FloatTensor
    target = tensor_type(list(concat(targets)))

    return (sources, src_nums), (target,)
Ejemplo n.º 33
0
def batchify_fn_extract_ff(pad, data, cuda=True):
    """Build forward and loss arguments from ``(source_list, targets)`` pairs.

    Args:
        pad: padding value for the source tensors.
        data: iterable of ``(source_list, targets)`` pairs.
        cuda: selects ``torch.cuda.FloatTensor`` over ``torch.FloatTensor``
            for the targets and is forwarded to ``pad_batch_tensorize``.

    Returns:
        ``((sources, src_nums), (target,))``.
    """
    per_example_sources, per_example_targets = tuple(
        map(list, unzip(data)))

    src_nums = [len(sentences) for sentences in per_example_sources]
    to_tensor = pad_batch_tensorize(pad=pad, cuda=cuda)
    sources = list(map(to_tensor, per_example_sources))

    float_tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
    flat_targets = list(concat(per_example_targets))
    target = float_tensor(flat_targets)

    fw_args = (sources, src_nums)
    loss_args = (target, )
    return fw_args, loss_args
Ejemplo n.º 34
0
def convert_batch_copy(unk, word2id, batch):
    """Convert a batch to ids for a copy mechanism.

    An extended vocabulary is built from ``word2id`` plus every source word
    it lacks, numbered in order of first appearance.

    Returns:
        list of ``(source, src_ext, tar_in, target)`` tuples where ``source``
        and ``tar_in`` use ``word2id`` while ``src_ext`` and ``target`` use
        the extended vocabulary.
    """
    sources, targets = (list(column) for column in unzip(batch))

    ext_word2id = dict(word2id)
    for source in sources:
        for word in source:
            # Only unseen words receive a new id.
            ext_word2id.setdefault(word, len(ext_word2id))

    src_exts = conver2id(unk, ext_word2id, sources)
    sources = conver2id(unk, word2id, sources)
    tar_ins = conver2id(unk, word2id, targets)
    targets = conver2id(unk, ext_word2id, targets)
    return list(zip(sources, src_exts, tar_ins, targets))
Ejemplo n.º 35
0
def batchify_fn(pad, start, end, data, cuda=True):
    """Batchify ``(source, target)`` id lists for a seq2seq model.

    The decoder input is each target prefixed with ``start``; the loss target
    is each target suffixed with ``end``.

    Returns:
        ``((source, src_lens, tar_in), (target,))``.
    """
    sources, raw_targets = (list(column) for column in unzip(data))

    src_lens = list(map(len, sources))
    decoder_inputs = [[start] + tgt for tgt in raw_targets]
    decoder_targets = [tgt + [end] for tgt in raw_targets]

    source = pad_batch_tensorize(sources, pad, cuda)
    tar_in = pad_batch_tensorize(decoder_inputs, pad, cuda)
    target = pad_batch_tensorize(decoder_targets, pad, cuda)

    return (source, src_lens, tar_in), (target, )
Ejemplo n.º 36
0
def batchify_fn_extract_ptr(pad, data, cuda=True):
    """Batchify ``(source_list, target)`` pairs for the pointer extractor.

    Returns:
        ``((sources, src_nums, tar_in), (target,))`` where ``target`` is
        padded with -1 and ``tar_in`` holds every target without its last
        element, padded with 0.
    """
    source_lists, targets = (list(column) for column in unzip(data))

    tensorize = pad_batch_tensorize(pad=pad, cuda=cuda)
    src_nums = [len(source) for source in source_lists]
    sources = [tensorize(source) for source in source_lists]

    # PAD is -1 (dummy extraction index) for using sequence loss
    target = pad_batch_tensorize(targets, pad=-1, cuda=cuda)

    # use 0 here for feeding first conv sentence repr.
    truncated = [tgt[:-1] for tgt in targets]
    tar_in = pad_batch_tensorize(truncated, pad=0, cuda=cuda)

    return (sources, src_nums, tar_in), (target, )
Ejemplo n.º 37
0
def batchify_fn_copy(pad, start, end, data, cuda=True):
    """Batchify 4-tuples ``(source, ext_source, tar_in, target)`` for copying.

    ``start`` is prepended to each decoder input and ``end`` appended to each
    target before tensorizing. ``ext_vsize`` is the largest id found in the
    tensorized extended sources plus one.

    Returns:
        ``((source, src_lens, tar_in, ext_src, ext_vsize), (target,))``.
    """
    sources, ext_srcs, tar_ins, targets = (
        list(column) for column in unzip(data))

    src_lens = list(map(len, sources))
    decoder_inputs = [[start] + tgt for tgt in tar_ins]
    decoder_targets = [tgt + [end] for tgt in targets]

    source = pad_batch_tensorize(sources, pad, cuda)
    tar_in = pad_batch_tensorize(decoder_inputs, pad, cuda)
    target = pad_batch_tensorize(decoder_targets, pad, cuda)
    ext_src = pad_batch_tensorize(ext_srcs, pad, cuda)

    ext_vsize = ext_src.max().item() + 1
    return (source, src_lens, tar_in, ext_src, ext_vsize), (target, )
Ejemplo n.º 38
0
def convert_batch(unk, word2id, batch):
    """Convert the sources and targets of ``batch`` with ``conver2id``.

    Returns:
        list of ``(source, target)`` pairs built from the converted sources
        and targets.
    """
    source_column, target_column = unzip(batch)
    converted_sources = conver2id(unk, word2id, source_column)
    converted_targets = conver2id(unk, word2id, target_column)
    return list(zip(converted_sources, converted_targets))