def vqa_eval_collate(inputs):
    """Collate VQA evaluation samples into one padded batch dict.

    Each sample unzips into
    ``(qid, input_ids, img_feat, img_pos_feat, attn_mask, target)``.
    Text ids and attention masks are right-padded with 0; image tensors are
    padded through ``pad_tensors``. ``targets`` is ``None`` when the first
    sample carries no target, otherwise the targets stacked along dim 0.
    """
    (qids, token_seqs, region_feats, region_pos_feats,
     masks, targets) = map(list, unzip(inputs))

    # Unpadded lengths are needed later to build the gather index.
    txt_lens = [seq.size(0) for seq in token_seqs]
    num_bbs = [feat.size(0) for feat in region_feats]

    input_ids = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)
    attn_masks = pad_sequence(masks, batch_first=True, padding_value=0)

    targets = None if targets[0] is None else torch.stack(targets, dim=0)

    img_feat = pad_tensors(region_feats, num_bbs)
    img_pos_feat = pad_tensors(region_pos_feats, num_bbs)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1))

    return {
        "qids": qids,
        "input_ids": input_ids,
        "position_ids": position_ids,
        "img_feat": img_feat,
        "img_pos_feat": img_pos_feat,
        "attn_masks": attn_masks,
        "gather_index": gather_index,
        "targets": targets,
    }
def itm_rank_collate(inputs):
    """Collate image-text ranking groups into one flat padded batch.

    ``inputs`` is a sequence of groups; each group holds
    ``(input_ids, img_feat, img_pos_feat, attn_mask)`` tuples and all groups
    must have the same length. The groups are concatenated before padding and
    the common group length is returned under ``'sample_size'``.
    """
    token_seqs, region_feats, region_pos_feats, masks = map(
        list, unzip(concat(inputs)))

    txt_lens = [seq.size(0) for seq in token_seqs]
    input_ids = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in region_feats]
    img_feat = pad_tensors(region_feats, num_bbs)
    img_pos_feat = pad_tensors(region_pos_feats, num_bbs)

    attn_masks = pad_sequence(masks, batch_first=True, padding_value=0)

    # Every group must contribute the same number of samples.
    sample_size = len(inputs[0])
    assert all(len(group) == sample_size for group in inputs)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1))

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'sample_size': sample_size,
    }
def get_batch(self, i, img_ids):
    """Pair the text of example ``i`` with every image in ``img_ids``.

    The text ids are repeated once per image; image features are fetched
    with ``self._get_img_feat`` and padded with ``pad_tensors``. The
    attention mask marks the text positions plus each image's own boxes.
    """
    n_imgs = len(img_ids)

    example = super().__getitem__(i)
    text_ids = self.txt_db.combine_inputs(example['input_ids'])
    input_ids = text_ids.unsqueeze(0).expand(n_imgs, -1).clone()
    tl = input_ids.size(1)
    position_ids = torch.arange(0, tl, dtype=torch.long).unsqueeze(0)

    # process image features (original note: gt always first)
    img_feats, img_pos_feats, num_bbs = map(
        list, unzip(map(self._get_img_feat, img_ids)))
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    # 1 over text + real boxes of each row, 0 over box padding.
    attn_masks = torch.zeros(n_imgs, max(num_bbs) + tl).long()
    for row, nbb in enumerate(num_bbs):
        attn_masks.data[row, :tl + nbb].fill_(1)

    gather_index = get_gather_index(
        [tl] * n_imgs, num_bbs, n_imgs, tl, attn_masks.size(1))

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
    }
def collate(inputs):
    """Collate video samples plus per-clip attention masks and metadata.

    Each sample unzips into
    ``(video_input, clip_ranges, attn_masks, meta)`` where ``meta`` is
    ``(vid, clip_ids, timestamps, ground_truths)``. Per-clip metadata is
    flattened so that entry ``k`` of every meta list describes clip ``k``.
    """
    video_inputs, all_clip_ranges, attn_masks_list, metas = map(
        list, unzip(inputs))

    flat_masks = list(concat(attn_masks_list))
    batch = {
        'cap_attn_mask': pad_sequence(
            flat_masks, batch_first=True, padding_value=0),
        'clip_ranges': tuple(map(tuple, all_clip_ranges)),
    }
    batch.update(video_collate(video_inputs))

    # meta: one record per clip, repeating the video name for each clip
    records = [
        (vid, int(cid), ts, gt)
        for vid, cids, tss, gts in metas
        for cid, ts, gt in zip(cids, tss, gts)
    ]
    batch['vid_names'] = [rec[0] for rec in records]
    batch['clip_ids'] = [rec[1] for rec in records]
    batch['all_ts'] = [rec[2] for rec in records]
    batch['gts'] = [rec[3] for rec in records]
    return batch
def video_qa_collate(inputs):
    """Collate video-QA samples into a single batch dict.

    Per-sample lists of video inputs, QA token ids, QA masks, targets and
    timestamp targets are flattened across samples. Targets are padded with
    -1; the QA text is collated through ``txt_input_collate`` and the video
    part through ``video_collate``.
    """
    (video_qa_inputs, qa_input_ids, qa_attn_masks,
     vids, target, ts_target) = map(list, unzip(inputs))

    def _flatten(groups):
        # Concatenate a list of per-sample groups into one flat list.
        merged = []
        for group in groups:
            merged.extend(group)
        return merged

    all_video_qa_inputs = _flatten(video_qa_inputs)
    all_qa_input_ids = _flatten(qa_input_ids)
    all_qa_attn_masks = _flatten(qa_attn_masks)

    # The original indexes targets by range(len(vids)); all columns come
    # from the same unzip, so they share that length.
    all_target = _flatten(target[:len(vids)])
    all_ts_target = _flatten(ts_target[:len(vids)])

    batch = video_collate(all_video_qa_inputs)

    targets = pad_sequence(all_target, batch_first=True, padding_value=-1)
    ts_targets = pad_sequence(
        all_ts_target, batch_first=True, padding_value=-1)
    input_ids, pos_ids, attn_masks = txt_input_collate(
        all_qa_input_ids, all_qa_attn_masks)

    batch["targets"] = targets
    batch["ts_targets"] = ts_targets
    batch['qa_input_ids'] = input_ids
    batch['qa_pos_ids'] = pos_ids
    batch['qa_attn_masks'] = attn_masks
    return batch
def batchify_fn_extract_ptr(pad, data, cuda=True):
    """Build forward/loss arguments for the pointer extractor.

    Args:
        pad: padding id used when tensorizing the source sentences.
        data: iterable of ``(source_list, target)`` pairs.
        cuda: forwarded to ``pad_batch_tensorize``.

    Returns:
        ``(fw_args, loss_args)`` where ``fw_args`` is
        ``(sources, src_nums, tar_in)`` and ``loss_args`` is ``(target,)``.
    """
    source_lists, targets = tuple(map(list, unzip(data)))

    src_nums = list(map(len, source_lists))
    sources = list(map(pad_batch_tensorize(pad=pad, cuda=cuda), source_lists))

    # PAD is -1 (dummy extraction index) for using sequence loss
    target = pad_batch_tensorize(targets, pad=-1, cuda=cuda)

    def remove_last(tgt):
        # Decoder input variant that also works for one-sentence summaries:
        # keep the first index, shift the rest right by one (same length as
        # tgt). Work on a copy so the caller's target sequences are not
        # modified in place (the previous slice assignment mutated them).
        # NOTE(review): assumes targets are list-like index sequences.
        shifted = list(tgt)
        shifted[1:] = shifted[:-1]
        return shifted

    tar_in = pad_batch_tensorize(
        [remove_last(tgt) for tgt in targets],
        pad=-0, cuda=cuda  # use 0 here for feeding first conv sentence repr.
    )

    fw_args = (sources, src_nums, tar_in)
    loss_args = (target, )
    return fw_args, loss_args
def itm_collate(inputs):
    """Collate image-text matching samples into one padded batch dict.

    Each sample unzips into
    ``(input_ids, img_feat, img_pos_feat, attn_mask, target)``. Text ids and
    attention masks are right-padded with 0 and the per-sample targets are
    concatenated along dim 0.
    """
    token_seqs, region_feats, region_pos_feats, masks, labels = map(
        list, unzip(inputs))

    txt_lens = [seq.size(0) for seq in token_seqs]
    input_ids = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in region_feats]
    img_feat = pad_tensors(region_feats, num_bbs)
    img_pos_feat = pad_tensors(region_pos_feats, num_bbs)

    attn_masks = pad_sequence(masks, batch_first=True, padding_value=0)
    targets = torch.cat(labels, dim=0)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1))

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'targets': targets,
    }
def batchify_fn_extract_ptr_entity(pad, data, cuda=True):
    """Build forward/loss arguments for the entity-aware pointer extractor.

    ``data`` yields ``(source_list, target, cluster_info)`` triples, where
    ``cluster_info`` is ``(clusters, word_positions, sentence_positions)``.
    Returns ``(fw_args, loss_args)``.
    """
    source_lists, targets, clusters_infos = tuple(map(list, unzip(data)))
    cluster_lists, cluster_wpos, cluster_spos = list(zip(*clusters_infos))

    src_nums = [len(src) for src in source_lists]
    # An empty cluster list is counted as size 1.
    cl_nums = [len(cl) or 1 for cl in cluster_lists]

    tensorize_src = pad_batch_tensorize(pad=pad, cuda=cuda, max_num=5)
    sources = [tensorize_src(src) for src in source_lists]

    # list of tensors, each tensor padded
    clusters = [
        pad_batch_tensorize(pad=pad, cuda=cuda, max_num=4)(cl)
        for cl in cluster_lists
    ]
    cluster_wpos = [
        pad_batch_tensorize(pad=pad, cuda=cuda, max_num=4)(wpos)
        for wpos in cluster_wpos
    ]
    cluster_spos = [
        pad_batch_tensorize(pad=pad, cuda=cuda, max_num=4)(spos)
        for spos in cluster_spos
    ]

    # PAD is -1 (dummy extraction index) for using sequence loss
    target = pad_batch_tensorize(targets, pad=-1, cuda=cuda)

    # Decoder input drops the last target index.
    decoder_inputs = [tgt[:-1] for tgt in targets]
    tar_in = pad_batch_tensorize(
        decoder_inputs,
        pad=-0, cuda=cuda  # use 0 here for feeding first conv sentence repr.
    )

    fw_args = (sources, src_nums, tar_in,
               (clusters, cluster_wpos, cluster_spos), cl_nums)
    loss_args = (target, )
    return fw_args, loss_args
def mrm_collate_for_vcr(inputs):
    """Collate masked-region-modeling samples for VCR into a tuple.

    Sequence fields are right-padded with 0; image features and position
    features are copied into zero tensors sized by the largest box count.

    Returns:
        ``(input_ids, position_ids, type_ids, txt_lens, img_feat,
        img_pos_feat, num_bbs, attn_masks, img_masks)``.
    """
    (token_seqs, pos_seqs, type_seqs, region_feats, region_pos_feats,
     masks, region_masks) = map(list, unzip(inputs))

    txt_lens = [seq.size(0) for seq in token_seqs]
    num_bbs = [feat.size(0) for feat in region_feats]

    def _pad0(seqs):
        return pad_sequence(seqs, batch_first=True, padding_value=0)

    input_ids = _pad0(token_seqs)
    position_ids = _pad0(pos_seqs)
    type_ids = _pad0(type_seqs)
    attn_masks = _pad0(masks)
    img_masks = _pad0(region_masks)

    n_samples = len(region_feats)
    widest = max(num_bbs)
    img_feat = torch.zeros(n_samples, widest, region_feats[0].size(1))
    img_pos_feat = torch.zeros(n_samples, widest, region_pos_feats[0].size(1))
    for row, (feat, pos) in enumerate(zip(region_feats, region_pos_feats)):
        n_boxes = feat.size(0)
        img_feat.data[row, :n_boxes, :] = feat.data
        img_pos_feat.data[row, :n_boxes, :] = pos.data

    return (input_ids, position_ids, type_ids, txt_lens,
            img_feat, img_pos_feat, num_bbs, attn_masks, img_masks)
def parse_df(df):
    """Parse ``df['code']`` and attach the seven parse outputs as columns.

    Rows whose parse fails receive ``None`` in every added column (the
    error default passed to ``show_process_map``). Returns the same ``df``.
    """
    identifier_set, type_set = extract_fake_c_header_identifier()

    clex = BufferedCLex(error_func=lambda self, msg, line, column: None,
                        on_lbrace_func=lambda: None,
                        on_rbrace_func=lambda: None,
                        type_lookup_func=lambda typ: None)
    clex.build()

    BEGIN, END, UNK = ["<BEGIN>", "<END>", "<UNK>"]
    from embedding.wordembedding import load_vocabulary
    vocabulary = load_vocabulary(
        get_token_vocabulary, get_vocabulary_id_map_with_keyword,
        [BEGIN], [END], UNK)

    print("the size of predefined_identifer:{}".format(len(identifier_set)))
    print("the size of typeset:{}".format(len(type_set)))

    parse_fn = monitored_slk_parse(clex=clex,
                                   predefined_identifer=identifier_set,
                                   predefined_typename=type_set,
                                   vocabulary=vocabulary)

    output_columns = (
        'parse_tree',
        'tokens',
        'consistent_identifier',
        'identifier_scope_index',
        'is_identifier',
        'max_scope_list',
        'consistent_typename',
    )
    parsed_code = show_process_map(
        parse_fn, df['code'],
        error_default_value=(None,) * len(output_columns))
    parsed_code = unzip(parsed_code)

    # Column k of the parse output goes to the k-th name above.
    for idx, column in enumerate(output_columns):
        df[column] = list(parsed_code[idx])
    return df
def coll_fn(data):
    """Flatten per-article sources/targets and drop falsy entries.

    Returns ``(sources, targets)`` as two flat lists.
    """
    source_lists, target_lists = unzip(data)
    # NOTE: independent filtering works because
    #       source and targets are matched properly by the Dataset
    sources = [src for src in concat(source_lists) if src]
    targets = [tgt for tgt in concat(target_lists) if tgt]
    assert all(sources) and all(targets)
    return sources, targets
def get_ext_word2id(unk, word2id, batch):
    """Return ``word2id`` extended with the source words it lacks.

    Args:
        unk: unused; kept for signature compatibility with sibling helpers.
        word2id: base vocabulary mapping word -> id (not modified).
        batch: iterable of ``(source, target)`` pairs; only sources are read.

    Returns:
        A new dict containing ``word2id`` plus every out-of-vocabulary
        source word, each assigned the next free id in order of first
        appearance.
    """
    sources, _ = zip(*batch)
    ext_word2id = dict(word2id)
    for source in sources:
        for word in source:
            # Test against the extended map: checking the base map would
            # re-assign a fresh id to every repeat of an OOV word and leave
            # gaps in the id range.
            if word not in ext_word2id:
                ext_word2id[word] = len(ext_word2id)
    return ext_word2id
def convert_batch(unk, word2id, batch):
    """Convert the words of every (source, target) pair to ids.

    Both columns go through ``conver2id`` with the same vocabulary and are
    zipped back into a list of ``(source_ids, target_ids)`` pairs.
    """
    # Words missing from the vocabulary are replaced by unknown; a converted
    # version is produced for both the input article and the abstract.
    raw_sources, raw_targets = map(list, unzip(batch))
    source_ids = conver2id(unk, word2id, raw_sources)
    target_ids = conver2id(unk, word2id, raw_targets)
    # Still one entry per sample, paired as (source, target).
    return list(zip(source_ids, target_ids))
def generate_data(max_index=10, max_length=10, min_length=5, number=100000):
    """Generate ``number`` sequences with their targets as a DataFrame.

    Columns: ``x`` (from ``generate_one_seq``), ``y`` and ``transform_id``
    (the two components returned by ``create_target`` for each ``x``).
    """
    df = pd.DataFrame()
    sequences = []
    for _ in range(number):
        sequences.append(generate_one_seq(max_index, max_length, min_length))
    df['x'] = sequences

    # Targets are computed from the values as stored in the frame.
    pairs = [create_target(seq) for seq in df['x']]
    y_column, id_column = unzip(pairs)
    df['y'] = list(y_column)
    df['transform_id'] = list(id_column)
    return df
def parse_df(df):
    """Parse ``df['code']`` and add parse tree, AST and token columns.

    Rows that fail to parse get ``None`` in all three columns (the error
    default passed to ``show_process_map``). Returns the same ``df``.
    """
    output_columns = ('parse_tree', 'ast', 'tokens')

    monitor = MonitoredParser()
    parsed_code = show_process_map(
        monitor.parse_get_production_list_and_token_list,
        df['code'],
        error_default_value=(None, None, None))
    parsed_code = unzip(parsed_code)

    for idx, column in enumerate(output_columns):
        df[column] = list(parsed_code[idx])
    return df
def mlm_collate_for_vcr(inputs):
    """Collate masked-language-modeling samples for VCR.

    Delegates everything except the labels to ``vcr_pretrain_collate`` and
    adds ``'txt_labels'`` right-padded with -1.
    """
    (token_seqs, type_seqs, region_feats, region_pos_feats,
     masks, label_seqs) = map(list, unzip(inputs))

    batch = vcr_pretrain_collate(
        token_seqs, type_seqs, region_feats, region_pos_feats, masks)
    batch['txt_labels'] = pad_sequence(
        label_seqs, batch_first=True, padding_value=-1)
    return batch
def coll_fn(data):
    """Split samples into per-article source and target lists.

    Returns ``(sources, targets)``; both are two-level lists with one inner
    list per article. Every article must be non-empty.
    """
    source_lists, target_lists = unzip(data)
    sources = [list(article_source) for article_source in source_lists]
    targets = [list(article_target) for article_target in target_lists]
    assert all(sources) and all(targets)
    # Two-level lists: one list per article, each holding its n sentences.
    return sources, targets
def coll_fn_graph(data):
    """Collate graph samples, dropping falsy sources/targets.

    Sources are filtered per sample (not flattened); targets are flattened
    across samples and then filtered. The remaining four columns are passed
    through exactly as produced by ``unzip``.
    """
    (source_lists, target_lists,
     nodes, edges, subgraphs, paras) = unzip(data)
    # NOTE: independent filtering works because
    #       source and targets are matched properly by the Dataset
    sources = [src for src in source_lists if src]
    targets = [tgt for tgt in concat(target_lists) if tgt]
    assert all(sources) and all(targets)
    return sources, targets, nodes, edges, subgraphs, paras
def xlmr_mrfr_collate(inputs):
    """Collate masked-region-feature-regression samples (XLM-R padding).

    Returns a dict with:
    - input_ids    : text ids, right-padded with 1
    - position_ids : (1, max_L) range over the padded text length
    - img_feat     : padded image features with masked regions processed by
                     ``_mask_img_feat``
    - img_pos_feat : padded image position features
    - attn_masks   : attention masks, right-padded with 0
    - gather_index : output of ``get_gather_index``
    - feat_targets : output of ``_get_feat_target`` on the unmasked features
    - img_masks    : region masks, right-padded with 0
    - img_mask_tgt : region mask targets, right-padded with 0
    """
    (token_seqs, region_feats, region_pos_feats,
     masks, region_masks, region_mask_tgts) = map(list, unzip(inputs))

    txt_lens = [seq.size(0) for seq in token_seqs]
    input_ids = pad_sequence(token_seqs, batch_first=True, padding_value=1)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in region_feats]
    padded_feat = pad_tensors(region_feats, num_bbs)
    img_pos_feat = pad_tensors(region_pos_feats, num_bbs)

    # mask features: targets are taken before the features are masked
    img_masks = pad_sequence(region_masks, batch_first=True, padding_value=0)
    feat_targets = _get_feat_target(padded_feat, img_masks)
    img_feat = _mask_img_feat(padded_feat, img_masks)
    img_mask_tgt = pad_sequence(
        region_mask_tgts, batch_first=True, padding_value=0)

    attn_masks = pad_sequence(masks, batch_first=True, padding_value=0)
    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1))

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'feat_targets': feat_targets,
        'img_masks': img_masks,
        'img_mask_tgt': img_mask_tgt,
    }
def xlmr_mmxlm_softlabel_collate(inputs):
    """Collate soft-label masked-region samples (XLM-R padding).

    Each sample unzips into ``(input_ids, img_feat, img_pos_feat, attn_mask,
    img_mask, tgt_mask, img_token_soft_label)``.

    Returns a dict with:
    :input_ids      text ids, right-padded with 1
    :position_ids   (1, max_L) range over the padded text length; per the
                    original note this does not matter for RoBERTa because
                    position ids are regenerated
    :img_feat       padded image features after ``_mask_img_feat``
    :img_pos_feat   padded image position features
    :attn_masks     attention masks, right-padded with 0
    :gather_index   output of ``get_gather_index``
    :img_masks      region masks, right-padded with 0
    :tgt_masks      target masks, right-padded with 0
    :label_targets  output of ``_get_targets`` on the padded soft labels
    """
    (input_ids, img_feats, img_pos_feats, attn_masks, img_masks, tgt_masks,
     img_token_soft_labels) = map(list, unzip(inputs))

    # text batches
    txt_lens = [i.size(0) for i in input_ids]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=1)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    # image batches
    num_bbs = [f.size(0) for f in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)
    img_token_soft_label = pad_tensors(img_token_soft_labels, num_bbs)

    # masks
    img_masks = pad_sequence(img_masks, batch_first=True, padding_value=0)
    tgt_masks = pad_sequence(tgt_masks, batch_first=True, padding_value=0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)

    # mask the selected regions and collect their soft-label targets
    img_feat = _mask_img_feat(img_feat, img_masks)
    label_targets = _get_targets(img_masks, img_token_soft_label)

    bs, max_tl = input_ids.size()
    out_size = attn_masks.size(1)
    gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size)

    batch = {'input_ids': input_ids,
             'position_ids': position_ids,
             'img_feat': img_feat,
             'img_pos_feat': img_pos_feat,
             'attn_masks': attn_masks,
             'gather_index': gather_index,
             'img_masks': img_masks,
             'tgt_masks': tgt_masks,
             'label_targets': label_targets}
    return batch
def __init__(self, items, dict_remove=None):
    """Wrap ``items`` via ``self._try_ref`` and store items and selectors.

    ``dict_remove``, when given, is called with this object from the
    ``remove`` callback handed to ``_try_ref``, as long as this object is
    still alive.
    """
    def remove(k, selfref=ref(self), dict_remove=dict_remove):
        # Only a weak reference to the owner is held by the callback.
        owner = selfref()
        if owner is None or dict_remove is None:
            return
        dict_remove(owner)

    wrapped = (self._try_ref(item, remove) for item in items)
    self._items, self._selectors = unzip(wrapped)
    self._items = tuple(self._items)
    self._selectors = tuple(self._selectors)
def __init__(self, items, dict_remove=None):
    """Build the item/selector tuples from ``items``.

    Every item is passed through ``self._try_ref`` together with a
    ``remove`` callback; that callback forwards this object to
    ``dict_remove`` if both are still available when it fires.
    """
    def remove(k, selfref=ref(self), dict_remove=dict_remove):
        target = selfref()
        # Nothing to do once the owner is gone or no hook was supplied.
        if not (target is None or dict_remove is None):
            dict_remove(target)

    pairs = (self._try_ref(entry, remove) for entry in items)
    self._items, self._selectors = unzip(pairs)
    self._items = tuple(self._items)
    self._selectors = tuple(self._selectors)
def convert_batch_gat(unk, word2id, batch):
    """Convert a graph batch to ids, adding copy-mechanism variants.

    Source words missing from ``word2id`` get temporary ids in an extended
    vocabulary. Each output tuple is
    ``(source, source_ext, target_in, target_ext, nodewords, nodelengths,
    sum_worthies, word_freq_feat, nodefreq, relations, rlengths, triples)``.
    """
    raw_sources, raw_targets, node_infos, edge_infos = list(
        map(list, unzip(batch)))
    (nodewords, nodelengths, sum_worhies,
     word_freq_feat, nodefreq) = list(unzip(node_infos))
    relations, rlengths, triples = list(unzip(edge_infos))

    # Extended vocabulary: base ids plus one new id per unseen source word.
    ext_word2id = dict(word2id)
    for sentence in raw_sources:
        for token in sentence:
            ext_word2id.setdefault(token, len(ext_word2id))

    src_exts = conver2id(unk, ext_word2id, raw_sources)
    sources = conver2id(unk, word2id, raw_sources)
    tar_ins = conver2id(unk, word2id, raw_targets)
    targets = conver2id(unk, ext_word2id, raw_targets)

    return list(zip(
        sources, src_exts, tar_ins, targets,
        nodewords, nodelengths, sum_worhies, word_freq_feat, nodefreq,
        relations, rlengths, triples))
def coll_fn(data):
    """Split samples into a label list and a concatenated-text list.

    Every text entry must be truthy. Returns ``(labels, concat_texts)``.
    """
    label_column, text_column = unzip(data)
    labels, concat_texts = list(label_column), list(text_column)
    assert all(concat_texts)
    return labels, concat_texts
def batchify_fn(pad, data, cuda=True):
    """Tensorize flattened sources and targets.

    Returns ``(fw_args, loss_args)`` = ``((sources,), (target,))`` where
    ``target`` is a (CUDA) ``LongTensor`` of the flattened targets.
    """
    source_lists, targets = tuple(map(list, unzip(data)))

    flat_sources = list(concat(source_lists))
    sources = pad_batch_tensorize(inputs=flat_sources, pad=pad, cuda=cuda)

    if cuda:
        tensor_type = torch.cuda.LongTensor
    else:
        tensor_type = torch.LongTensor
    target = tensor_type(list(concat(targets)))

    return (sources,), (target,)
def vcr_eval_collate(inputs):
    """Collate VCR evaluation samples into a tuple of padded tensors.

    Every sample carries lists of per-candidate fields; these are flattened
    across samples, sequence fields are right-padded with 0, and image
    features are copied into zero tensors sized by the largest box count.
    QA / QAR targets are stacked per sample along dim 0.
    """
    (qids, input_ids, position_ids, txt_lens, txt_type_ids,
     img_feats, img_pos_feats, num_bbs, attn_masks,
     qa_targets, qar_targets, obj_targets) = map(list, unzip(inputs))

    def _flatten(groups):
        # Concatenate the per-sample groups into one flat list.
        merged = []
        for group in groups:
            merged.extend(group)
        return merged

    def _pad0(seqs):
        return pad_sequence(seqs, batch_first=True, padding_value=0)

    all_txt_lens = _flatten(txt_lens)
    all_num_bbs = _flatten(num_bbs)
    all_img_feats = _flatten(img_feats)
    all_img_pos_feats = _flatten(img_pos_feats)

    all_input_ids = _pad0(_flatten(input_ids))
    all_position_ids = _pad0(_flatten(position_ids))
    all_txt_type_ids = _pad0(_flatten(txt_type_ids))
    all_attn_masks = _pad0(_flatten(attn_masks))
    all_obj_targets = _pad0(_flatten(obj_targets))

    all_qa_targets = torch.stack(qa_targets, dim=0)
    all_qar_targets = torch.stack(qar_targets, dim=0)

    n_rows = len(all_img_feats)
    widest = max(all_num_bbs)
    all_img_feat = torch.zeros(n_rows, widest, all_img_feats[0].size(1))
    all_img_pos_feat = torch.zeros(
        n_rows, widest, all_img_pos_feats[0].size(1))
    for row, (feat, pos) in enumerate(zip(all_img_feats, all_img_pos_feats)):
        n_boxes = feat.size(0)
        all_img_feat.data[row, :n_boxes, :] = feat.data
        all_img_pos_feat.data[row, :n_boxes, :] = pos.data

    return (qids, all_input_ids, all_position_ids, all_txt_lens,
            all_txt_type_ids,
            all_img_feat, all_img_pos_feat, all_num_bbs,
            all_attn_masks, all_qa_targets, all_qar_targets, all_obj_targets)
def mlm_collate(inputs):
    """Collate masked-language-modeling samples (two-flow variant).

    Returns a dict with:
    :input_ids       text ids, right-padded with 0
    :position_ids    (1, max_L) range over the padded text length
    :img_feat        padded image features
    :img_pos_feat    padded image position features
    :attn_masks      joint attention masks, right-padded with 0
    :attn_masks_txt  text-only attention masks, right-padded with 0
    :attn_masks_img  image-only attention masks, right-padded with 0
    :gather_index    output of ``get_gather_index``
    :txt_labels      text labels, right-padded with -1
    """
    (token_seqs, region_feats, region_pos_feats, joint_masks,
     label_seqs, txt_masks, img_masks) = map(list, unzip(inputs))

    # text batches
    txt_lens = [seq.size(0) for seq in token_seqs]
    input_ids = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    txt_labels = pad_sequence(label_seqs, batch_first=True, padding_value=-1)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    # image batches
    num_bbs = [feat.size(0) for feat in region_feats]
    img_feat = pad_tensors(region_feats, num_bbs)
    img_pos_feat = pad_tensors(region_pos_feats, num_bbs)

    attn_masks = pad_sequence(joint_masks, batch_first=True, padding_value=0)
    # separate masks for the two-flow model
    attn_masks_txt = pad_sequence(
        txt_masks, batch_first=True, padding_value=0)
    attn_masks_img = pad_sequence(
        img_masks, batch_first=True, padding_value=0)

    bs, max_tl = input_ids.size()
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, max_tl, attn_masks.size(1))

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'attn_masks_txt': attn_masks_txt,
        'attn_masks_img': attn_masks_img,
        'gather_index': gather_index,
        'txt_labels': txt_labels,
    }
def itm_ot_collate(inputs):
    """Collate image-text matching samples with optimal-transport inputs.

    Besides the usual padded text/image tensors and the separate text and
    image attention masks, the batch carries ``'ot_inputs'``: the scatter
    index from ``_compute_ot_scatter``, its maximum, and the text / image
    pads from ``_compute_pad``.
    """
    (token_seqs, region_feats, region_pos_feats, joint_masks,
     labels, txt_masks, img_masks) = map(list, unzip(inputs))

    txt_lens = [seq.size(0) for seq in token_seqs]
    input_ids = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in region_feats]
    img_feat = pad_tensors(region_feats, num_bbs)
    img_pos_feat = pad_tensors(region_pos_feats, num_bbs)

    attn_masks = pad_sequence(joint_masks, batch_first=True, padding_value=0)
    attn_masks_txt = pad_sequence(
        txt_masks, batch_first=True, padding_value=0)
    attn_masks_img = pad_sequence(
        img_masks, batch_first=True, padding_value=0)
    targets = torch.cat(labels, dim=0)

    bs, padded_tl = input_ids.size()
    joint_len = attn_masks.size(1)
    gather_index = get_gather_index(
        txt_lens, num_bbs, bs, padded_tl, joint_len)

    # OT inputs
    longest_txt = max(txt_lens)
    most_boxes = max(num_bbs)
    ot_scatter = _compute_ot_scatter(txt_lens, longest_txt, joint_len)
    ot_inputs = {
        'ot_scatter': ot_scatter,
        'scatter_max': ot_scatter.max().item(),
        'txt_pad': _compute_pad(txt_lens, longest_txt),
        'img_pad': _compute_pad(num_bbs, most_boxes),
    }

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'attn_masks_txt': attn_masks_txt,
        'attn_masks_img': attn_masks_img,
        'gather_index': gather_index,
        'targets': targets,
        'ot_inputs': ot_inputs,
    }
def parse_input(batch_data, do_sample=False):
    """Turn a batch dict into the tensors consumed by the model.

    Token and length fields are padded with ``PaddedList`` and moved with
    ``to_cuda``. The adjacency matrix is taken directly from
    ``batch_data['adj']`` unless the module-level ``use_ast`` flag is set, in
    which case it is built from per-sample index lists as a sparse tensor
    and densified. When ``do_sample`` is true the three target outputs are
    ``None``.

    Returns:
        ``(input_seq, input_line_length, input_line_token_length,
        input_length, adj_matrix, target_error_position, target_seq,
        target_length)``.
    """
    def _padded_long(key, **pad_kwargs):
        # Pad batch_data[key], convert to LongTensor and move to device.
        return to_cuda(
            torch.LongTensor(PaddedList(batch_data[key], **pad_kwargs)))

    input_seq = _padded_long('error_token_ids', fill_value=0)
    input_line_length = _padded_long('error_line_length')
    input_line_token_length = _padded_long('error_line_token_length')
    input_length = _padded_long('error_token_length')

    if use_ast:
        # Prefix each index list with its sample number, then transpose the
        # flattened lists into per-axis coordinate rows.
        indexed = [[[sample_idx] + entry for entry in entries]
                   for sample_idx, entries in enumerate(batch_data['adj'])]
        adjacent_tuple = [
            list(axis) for axis in unzip(more_itertools.flatten(indexed))
        ]
        size = max(batch_data['error_token_length'])
        adjacent_tuple = torch.LongTensor(adjacent_tuple)
        adjacent_values = torch.ones(adjacent_tuple.shape[1]).long()
        adjacent_size = torch.Size(
            [len(batch_data['error_token_length']), size, size])
        info('batch_data input_length: ' +
             str(batch_data['error_token_length']))
        info('size: ' + str(size))
        info('adjacent_tuple: ' + str(adjacent_tuple.shape))
        info('adjacent_size: ' + str(adjacent_size))
        sparse_adj = torch.sparse.LongTensor(
            adjacent_tuple,
            adjacent_values,
            adjacent_size,
        )
        adj_matrix = to_cuda(sparse_adj.float().to_dense())
    else:
        adj_matrix = to_cuda(torch.LongTensor(batch_data['adj']))

    if do_sample:
        target_error_position = None
        target_seq = None
        target_length = None
    else:
        target_error_position = _padded_long('error_line')
        target_seq = _padded_long('target_line_ids', fill_value=ignore_id)
        target_length = _padded_long('target_line_length')

    return (input_seq, input_line_length, input_line_token_length,
            input_length, adj_matrix, target_error_position, target_seq,
            target_length)
def collate(inputs):
    """Collate video-captioning samples into one batch dict.

    Each sample unzips into ``(video_input, clip_ranges, cap_inputs)`` where
    ``cap_inputs`` is a list of ``(input_ids, tgt_ids, attn_mask)`` tuples.
    Caption inputs are flattened across samples; ids are padded with 1,
    target ids with -1 and masks with 0. The result is merged with the
    output of ``video_collate``.
    """
    video_inputs, all_clip_ranges, cap_inputs = map(list, unzip(inputs))
    caption_ids, caption_tgts, caption_masks = map(
        list, unzip(concat(cap_inputs)))

    input_ids = pad_sequence(caption_ids, batch_first=True, padding_value=1)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)
    tgt_ids = pad_sequence(caption_tgts, batch_first=True, padding_value=-1)
    attn_mask = pad_sequence(caption_masks, batch_first=True, padding_value=0)

    batch = {
        'cap_input_ids': input_ids,
        'cap_pos_ids': position_ids,
        'cap_tgt_ids': tgt_ids,
        'cap_attn_mask': attn_mask,
        'clip_ranges': tuple(map(tuple, all_clip_ranges)),
    }
    batch.update(video_collate(video_inputs))
    return batch
def batchify_fn_extract_ff(pad, data, cuda=True):
    """Build forward/loss arguments for the feed-forward extractor.

    Returns ``((sources, src_nums), (target,))`` where ``target`` is a
    (CUDA) ``FloatTensor`` of the flattened targets.
    """
    source_lists, targets = tuple(map(list, unzip(data)))

    src_nums = [len(article) for article in source_lists]
    tensorize = pad_batch_tensorize(pad=pad, cuda=cuda)
    sources = [tensorize(article) for article in source_lists]

    if cuda:
        tensor_type = torch.cuda.FloatTensor
    else:
        tensor_type = torch.FloatTensor
    target = tensor_type(list(concat(targets)))

    return (sources, src_nums), (target,)
def batchify_fn_extract_ff(pad, data, cuda=True):
    """Tensorize sources per article and flatten targets to floats.

    Returns ``(fw_args, loss_args)`` with ``fw_args = (sources, src_nums)``
    and ``loss_args = (target,)``.
    """
    source_lists, targets = tuple(map(list, unzip(data)))

    src_nums = []
    sources = []
    for article in source_lists:
        src_nums.append(len(article))
        sources.append(pad_batch_tensorize(pad=pad, cuda=cuda)(article))

    float_type = torch.cuda.FloatTensor if cuda else torch.FloatTensor
    flat_targets = list(concat(targets))
    target = float_type(flat_targets)

    fw_args = (sources, src_nums)
    loss_args = (target, )
    return fw_args, loss_args
def convert_batch_copy(unk, word2id, batch):
    """Convert a batch to ids for the copy mechanism.

    Source words missing from ``word2id`` get temporary ids in an extended
    vocabulary. Each output tuple is
    ``(source, source_ext, target_in, target_ext)``: the plain variants use
    ``word2id``, the ``_ext`` variants use the extended vocabulary.
    """
    raw_sources, raw_targets = map(list, unzip(batch))

    # Extended vocabulary: base ids plus one new id per unseen source word.
    ext_word2id = dict(word2id)
    for sentence in raw_sources:
        for token in sentence:
            ext_word2id.setdefault(token, len(ext_word2id))

    src_exts = conver2id(unk, ext_word2id, raw_sources)
    sources = conver2id(unk, word2id, raw_sources)
    tar_ins = conver2id(unk, word2id, raw_targets)
    targets = conver2id(unk, ext_word2id, raw_targets)
    return list(zip(sources, src_exts, tar_ins, targets))
def batchify_fn(pad, start, end, data, cuda=True):
    """Build forward/loss arguments for the seq2seq model.

    The decoder input is each target prefixed with ``start``; the loss
    target is each target suffixed with ``end``.
    Returns ``((source, src_lens, tar_in), (target,))``.
    """
    sources, raw_targets = tuple(map(list, unzip(data)))

    src_lens = list(map(len, sources))
    decoder_inputs = [[start] + tgt for tgt in raw_targets]
    decoder_targets = [tgt + [end] for tgt in raw_targets]

    source = pad_batch_tensorize(sources, pad, cuda)
    tar_in = pad_batch_tensorize(decoder_inputs, pad, cuda)
    target = pad_batch_tensorize(decoder_targets, pad, cuda)

    return (source, src_lens, tar_in), (target, )
def batchify_fn_extract_ptr(pad, data, cuda=True):
    """Build forward/loss arguments for the pointer extractor.

    The decoder input is each target without its last index, padded with 0;
    the loss target is padded with -1.
    Returns ``((sources, src_nums, tar_in), (target,))``.
    """
    source_lists, targets = tuple(map(list, unzip(data)))

    src_nums = [len(article) for article in source_lists]
    tensorize = pad_batch_tensorize(pad=pad, cuda=cuda)
    sources = [tensorize(article) for article in source_lists]

    # PAD is -1 (dummy extraction index) for using sequence loss
    target = pad_batch_tensorize(targets, pad=-1, cuda=cuda)

    decoder_inputs = [tgt[:-1] for tgt in targets]
    tar_in = pad_batch_tensorize(
        decoder_inputs,
        pad=-0, cuda=cuda  # use 0 here for feeding first conv sentence repr.
    )

    return (sources, src_nums, tar_in), (target, )
def batchify_fn_copy(pad, start, end, data, cuda=True):
    """Build forward/loss arguments for the copy-mechanism seq2seq model.

    ``data`` yields ``(source, ext_source, target_in, target)`` tuples. The
    decoder input is prefixed with ``start`` and the loss target suffixed
    with ``end``. The extended vocabulary size is the largest id in the
    padded extended sources plus one.

    Returns:
        ``((source, src_lens, tar_in, ext_src, ext_vsize), (target,))``.
    """
    raw_sources, raw_ext_srcs, raw_tar_ins, raw_targets = tuple(
        map(list, unzip(data)))

    src_lens = list(map(len, raw_sources))
    sources = list(raw_sources)
    ext_srcs = list(raw_ext_srcs)
    decoder_inputs = [[start] + tgt for tgt in raw_tar_ins]
    decoder_targets = [tgt + [end] for tgt in raw_targets]

    source = pad_batch_tensorize(sources, pad, cuda)
    tar_in = pad_batch_tensorize(decoder_inputs, pad, cuda)
    target = pad_batch_tensorize(decoder_targets, pad, cuda)
    ext_src = pad_batch_tensorize(ext_srcs, pad, cuda)

    ext_vsize = ext_src.max().item() + 1
    return (source, src_lens, tar_in, ext_src, ext_vsize), (target, )
def convert_batch(unk, word2id, batch):
    """Convert the words of every (source, target) pair to ids.

    Returns a list of ``(source_ids, target_ids)`` pairs produced by
    ``conver2id`` with the given vocabulary.
    """
    raw_sources, raw_targets = unzip(batch)
    source_ids = conver2id(unk, word2id, raw_sources)
    target_ids = conver2id(unk, word2id, raw_targets)
    return list(zip(source_ids, target_ids))