Code example #1
def data_loader_checker():
    # Sanity check: iterate over the tokenized dev set and make sure every batch
    # can be built (read_train_dev_data_frame, get_hotpotqa_longformer_tokenizer,
    # LongformerQATensorizer and HotpotDevDataset come from the project's code).
    file_path = '../data/hotpotqa/distractor_qa'
    dev_file_name = 'hotpot_dev_distractor_wiki_tokenized.json'
    from time import time
    from torch.utils.data import DataLoader
    batch_size = 6

    data_frame = read_train_dev_data_frame(PATH=file_path,
                                           json_fileName=dev_file_name)
    for col in data_frame.columns:
        print(col)
    longtokenizer = get_hotpotqa_longformer_tokenizer()
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=longtokenizer,
                                               max_length=4096)
    start_time = time()
    dev_dataloader = DataLoader(HotpotDevDataset(
        data_frame=data_frame, hotpot_tensorizer=hotpot_tensorizer),
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=6,
                                collate_fn=HotpotDevDataset.collate_fn)

    for batch_idx, sample in enumerate(dev_dataloader):
        # Access one field of each collated batch; uncomment the prints below to
        # inspect the tensor shapes.
        x = sample['doc_start']
        # print(sample['doc_start'].shape)
        # print(sample['sent_start'].shape)
    print('Runtime = {}'.format(time() - start_time))
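
For reference, `HotpotDevDataset.collate_fn` above follows the standard PyTorch custom-collate idiom: the dataset yields variable-length encodings and the collate function pads them into a single batch tensor. Below is a minimal, self-contained sketch of that idiom with toy data; the class and key names (`ToyQADataset`, `'ctx_encode'`, `'doc_lens'`) are hypothetical and only mirror the keys used in the checker, not the project's actual implementation.

import torch
from torch.utils.data import DataLoader, Dataset


class ToyQADataset(Dataset):
    """Toy stand-in for HotpotDevDataset: each item is a variable-length encoding."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        ids = torch.tensor(self.encodings[idx], dtype=torch.long)
        return {'ctx_encode': ids, 'doc_lens': torch.tensor(len(ids))}

    @staticmethod
    def collate_fn(batch):
        # Pad every encoding in the batch to the length of the longest one.
        max_len = max(item['ctx_encode'].size(0) for item in batch)
        padded = torch.zeros(len(batch), max_len, dtype=torch.long)
        for i, item in enumerate(batch):
            padded[i, :item['ctx_encode'].size(0)] = item['ctx_encode']
        return {'ctx_encode': padded,
                'doc_lens': torch.stack([item['doc_lens'] for item in batch])}


toy_loader = DataLoader(ToyQADataset([[1, 2, 3], [4, 5], [6, 7, 8, 9]]),
                        batch_size=2, shuffle=False,
                        collate_fn=ToyQADataset.collate_fn)
for batch in toy_loader:
    print(batch['ctx_encode'].shape, batch['doc_lens'])
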
Code example #2
def test_data_loader_checker():
    file_path = '../data/hotpotqa/distractor_qa'
    dev_file_name = 'hotpot_dev_distractor_wiki_tokenized.json'
    from time import time
    from torch.utils.data import DataLoader
    batch_size = 1
    data_frame = read_train_dev_data_frame(PATH=file_path,
                                           json_fileName=dev_file_name)
    longtokenizer = get_hotpotqa_longformer_tokenizer()
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=longtokenizer,
                                               max_length=4096)
    start_time = time()
    test_dataloader = DataLoader(HotpotDevDataset(
        data_frame=data_frame, hotpot_tensorizer=hotpot_tensorizer),
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=1,
                                 collate_fn=HotpotDevDataset.collate_fn)
    for batch_idx, sample in enumerate(test_dataloader):
        # Inspect the sentence/document mask tensors of the first batch only.
        sd_mask = sample['sd_mask']
        # print(sd_mask)
        # print(sd_mask[0])
        print(sample['doc_lens'])
        print(sample['sent_lens'])

        ss_mask = sample['ss_mask']
        # print(ss_mask[0].detach().tolist())
        print(ss_mask.shape)
        print(ss_mask[0].sum(dim=1))
        print(sd_mask.shape)
        break
    print('Runtime = {}'.format(time() - start_time))
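
The checks above look at shapes and row sums of the mask tensors returned by the collate function. As a self-contained illustration of why `mask[0].sum(dim=1)` is a useful sanity check, the sketch below builds a toy sentence-to-token mask from sentence lengths and verifies that each row sums back to the corresponding length. The helper name `build_sent_mask` and the layout are assumptions made for illustration; the real semantics of `sd_mask`/`ss_mask` are defined by the project's collate functions.

import torch


def build_sent_mask(sent_lens, max_tokens):
    """Hypothetical helper: row i marks the token positions of sentence i."""
    mask = torch.zeros(len(sent_lens), max_tokens)
    offset = 0
    for i, n in enumerate(sent_lens):
        mask[i, offset:offset + n] = 1.0
        offset += n
    return mask


sent_lens = [4, 2, 5]
mask = build_sent_mask(sent_lens, max_tokens=16)
print(mask.shape)        # torch.Size([3, 16])
print(mask.sum(dim=1))   # tensor([4., 2., 5.]) -- should match sent_lens
assert mask.sum(dim=1).tolist() == [float(n) for n in sent_lens]
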
Code example #3
def answer_consistent_checker():
    file_path = '../data/hotpotqa/distractor_qa'
    dev_file_name = 'hotpot_dev_distractor_wiki_tokenized.json'
    from time import time
    from torch.utils.data import DataLoader
    batch_size = 1

    data_frame = read_train_dev_data_frame(PATH=file_path,
                                           json_fileName=dev_file_name)
    print(data_frame['answer_len'].max())
    # for col in data_frame.columns:
    #     print(col)
    longtokenizer = get_hotpotqa_longformer_tokenizer()
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=longtokenizer,
                                               max_length=4096)
    start_time = time()
    dev_dataloader = DataLoader(HotpotTrainDataset(
        data_frame=data_frame, hotpot_tensorizer=hotpot_tensorizer),
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=14,
                                collate_fn=HotpotTrainDataset.collate_fn)
    # Accumulate sequence-length and answer-span-length statistics over the split.
    max_seq_len = 0
    average_seq_len = 0
    count = 0
    max_answer_len = 0
    for batch_idx, sample in enumerate(dev_dataloader):
        # if batch_idx % 1000 == 0:
        #     print(batch_idx)
        ctx_encode = sample['ctx_encode']
        ctx_encode_lens = sample['doc_lens']

        answer_start = sample['ans_start'].squeeze(dim=-1)
        answer_end = sample['ans_end'].squeeze(dim=-1)
        doc_start = sample['doc_start'].squeeze(dim=-1)
        doc_end = sample['doc_end'].squeeze(dim=-1)
        sent_start = sample['sent_start'].squeeze(dim=-1)
        batch_size = ctx_encode.shape[0]
        for id in range(batch_size):
            # doc_token_num = ctx_encode_lens[id].sum().data.item()
            doc_token_num = doc_end[id].detach().tolist()[-1]
            if max_seq_len < doc_token_num:
                max_seq_len = doc_token_num
            average_seq_len = average_seq_len + doc_token_num
            count = count + 1
            doc_start_i = doc_start[id]
            sent_start_i = sent_start[id]
            ctx_encode_i = ctx_encode[id]
            ans_start_i = answer_start[id].data.item()
            ans_end_i = answer_end[id].data.item()
            if max_answer_len < (ans_end_i - ans_start_i) + 1:
                max_answer_len = (ans_end_i - ans_start_i) + 1
            # Decoding the stored span should reproduce the original answer;
            # uncomment the prints below to eyeball suspicious cases.
            decode_answer = longtokenizer.decode(
                ctx_encode_i[ans_start_i:(ans_end_i + 1)])

            # print('{}\t{}'.format(batch_idx, decode_answer))
            # if '<p>' in decode_answer or '<d>' in decode_answer or '<q>' in decode_answer or '</q>' in decode_answer:
            #     print('index = {}'.format(batch_idx))
            #     print('decode answer {}'.format(decode_answer))
            #     print('Decode Query {}'.format(longtokenizer.decode(ctx_encode_i[:doc_start_i[0]])))
            # print('decode answer {}'.format(decode_answer))

    print('max seq len: {}, average seq len: {} over {} examples'.format(
        max_seq_len, average_seq_len / count, count))
    print('max answer len: {}'.format(max_answer_len))
    return
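
The core of the answer-consistency check is that slicing the encoded context with the stored `[ans_start, ans_end]` span and decoding it should reproduce the original answer. The sketch below shows the same round trip with a toy whitespace tokenizer, so it runs without the project's data or a pretrained tokenizer; with a real subword tokenizer the decoded string may differ by whitespace or normalisation, so an exact-match assert would be too strict there.

# Toy round-trip check: encode a context, record the answer span in token
# positions, then decode that span and compare it with the original answer.
context = 'Kinnairdy Castle is a tower house , having five storeys and a garret .'
answer = 'tower house'

tokens = context.split()                 # whitespace "tokenizer" for the sketch
vocab = {tok: i for i, tok in enumerate(sorted(set(tokens)))}
ctx_encode = [vocab[tok] for tok in tokens]

ans_tokens = answer.split()
for start in range(len(tokens) - len(ans_tokens) + 1):
    if tokens[start:start + len(ans_tokens)] == ans_tokens:
        ans_start, ans_end = start, start + len(ans_tokens) - 1
        break

inv_vocab = {i: tok for tok, i in vocab.items()}
decoded = ' '.join(inv_vocab[i] for i in ctx_encode[ans_start:ans_end + 1])
print(decoded)                           # tower house
assert decoded == answer
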
Code example #4
def data_consistent_checker(train=True):
    file_path = '../data/hotpotqa/distractor_qa'
    from time import time
    from torch.utils.data import DataLoader
    batch_size = 2
    longtokenizer = get_hotpotqa_longformer_tokenizer()
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=longtokenizer,
                                               max_length=4096)
    if train:
        dev_file_name = 'hotpot_train_distractor_wiki_tokenized.json'
        data_frame = read_train_dev_data_frame(PATH=file_path,
                                               json_fileName=dev_file_name)
        start_time = time()
        dev_dataloader = DataLoader(HotpotTrainDataset(
            data_frame=data_frame, hotpot_tensorizer=hotpot_tensorizer),
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=1,
                                    collate_fn=HotpotTrainDataset.collate_fn)
    else:
        dev_file_name = 'hotpot_dev_distractor_wiki_tokenized.json'
        data_frame = read_train_dev_data_frame(PATH=file_path,
                                               json_fileName=dev_file_name)
        start_time = time()
        dev_dataloader = DataLoader(HotpotDevDataset(
            data_frame=data_frame, hotpot_tensorizer=hotpot_tensorizer),
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=1,
                                    collate_fn=HotpotDevDataset.collate_fn)

    batch_data_frame = data_frame.head(batch_size)
    print(batch_data_frame.shape)
    for idx, row in batch_data_frame.iterrows():
        context = row['context']
        supp_fact_filtered = row['supp_facts_filtered']
        # for supp, sen_idx in supp_fact_filtered:
        #     print('Support doc: {}, sent id: {}'.format(supp, sen_idx))
        print('Query {}'.format(row['question']))
        for doc_idx, doc in enumerate(context):
            # print('doc {}: title = {} \n text = {}'.format(doc_idx + 1, doc[0], '\n'.join(doc[1])))
            print('doc {}: title = {}'.format(doc_idx + 1, doc[0]))
            for supp, sen_idx in supp_fact_filtered:
                if doc[0] == supp:
                    print('supp fact doc {}: sent = {} text = {}'.format(
                        doc_idx, sen_idx, doc[1][sen_idx]))
        print('*' * 70)
        print('Original answer = {}'.format(row['norm_answer']))
        print('=' * 70)
    print('+' * 70)
    print('\n' * 3)

    for batch_idx, sample in enumerate(dev_dataloader):
        # for key, value in sample.items():
        #     print(key)
        ctx_encode = sample['ctx_encode']
        ctx_marker_mask = sample['marker']
        global_atten = sample['ctx_global_mask']
        atten_mask = sample['ctx_attn_mask']
        sup_sent_labels = sample['sent_labels'].squeeze(dim=-1)
        sent2doc_map = sample['s2d_map']
        sentIndoc_map = sample['sInd_map']
        sent_start = sample['sent_start']
        sent_end = sample['sent_end']
        # print('sent num = {}'.format(sent_end.shape[1]))
        answer_start = sample['ans_start'].squeeze(dim=-1)
        answer_end = sample['ans_end'].squeeze(dim=-1)
        doc_start = sample['doc_start'].squeeze(dim=-1)
        token2sent_map = sample['t2s_map'].squeeze(dim=-1)
        if train:
            head_idx = sample['head_idx'].squeeze(dim=-1)
            tail_idx = sample['tail_idx'].squeeze(dim=-1)

        for id in range(batch_size):
            ctx_marker_i = ctx_marker_mask[id]
            # squeeze(-1) keeps a 1-D index tensor even if only one sentence is labelled
            supp_idxes = (sup_sent_labels[id] > 0).nonzero().squeeze(-1)
            doc_idxes = sent2doc_map[id][supp_idxes].detach().tolist()
            sent_idxes = sentIndoc_map[id][supp_idxes].detach().tolist()
            doc_start_i = doc_start[id]
            doc_sent_pairs = list(zip(doc_idxes, sent_idxes))
            sent_start_i = sent_start[id]
            sent_end_i = sent_end[id]
            ctx_encode_i = ctx_encode[id]
            token2sent_map_i = token2sent_map[id]

            # print('token to sentence {}'.format(token2sent_map_i.max()))
            # Decode every sentence by collecting the token positions mapped to it.
            max_sent_num = token2sent_map_i.max().data.item()
            for sent_id in range(max_sent_num):
                sent_token_idxs = (
                    token2sent_map_i == sent_id).nonzero().squeeze()
                print('sent {} text = {}'.format(
                    sent_id,
                    longtokenizer.decode(ctx_encode_i[sent_token_idxs])))

            if train:
                print('head doc idx = {}'.format(head_idx[id]))
                print('tail doc idx = {}'.format(tail_idx[id]))

            global_atten_i = global_atten[id]
            global_atten_i_indexes = (global_atten_i > 0).nonzero().squeeze()
            global_atten_text = longtokenizer.decode(
                ctx_encode_i[global_atten_i_indexes])
            print('global attention text: {}'.format(global_atten_text))

            atten_i = atten_mask[id]
            atten_i_indexes = (atten_i > 0).nonzero().squeeze()
            atten_text = longtokenizer.decode(ctx_encode_i[atten_i_indexes])
            # print('attention text: {}'.format(atten_text))
            print('x' * 75)
            # print('decode text: {}'.format(longtokenizer.decode(ctx_encode_i)))

            ans_start_i = answer_start[id].data.item()
            ans_end_i = answer_end[id].data.item()
            #
            print('Decode Query {}'.format(
                longtokenizer.decode(ctx_encode_i[:doc_start_i[0]])))
            print('Decode Answer {}'.format(
                longtokenizer.decode(ctx_encode_i[ans_start_i:(ans_end_i +
                                                               1)])))

            ctx_marker_i_indexes = (ctx_marker_i > 0).nonzero().squeeze()
            print('Decode marker text = {}'.format(
                longtokenizer.decode(ctx_encode_i[ctx_marker_i_indexes])))
            for pair_id, (d_idx, s_idx) in enumerate(doc_sent_pairs):
                supp_idx = supp_idxes[pair_id]
                start_i, end_i = sent_start_i[supp_idx], sent_end_i[supp_idx] + 1
                print('doc {}, sent {}, text {}'.format(
                    d_idx, s_idx,
                    longtokenizer.decode(ctx_encode_i[start_i:end_i])))
            print('=' * 70)
        break
    print('Runtime = {}'.format(time() - start_time))
    return
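
The supporting-fact lookup in the loop above depends on index-map tensors: `(sent_labels > 0).nonzero()` yields the global indices of the supporting sentences, and `s2d_map` / `sInd_map` translate them into (document index, sentence-within-document index) pairs. A minimal version of that translation is sketched below; the tensor contents are made up purely for illustration.

import torch

# Toy batch: 3 documents with 2, 3 and 2 sentences (7 sentences in total).
sent_labels = torch.tensor([0, 1, 0, 0, 1, 0, 0])  # sentences 1 and 4 are supporting
s2d_map = torch.tensor([0, 0, 1, 1, 1, 2, 2])      # global sentence idx -> doc idx
sInd_map = torch.tensor([0, 1, 0, 1, 2, 0, 1])     # global sentence idx -> idx within doc

supp_idxes = (sent_labels > 0).nonzero().squeeze(-1)   # tensor([1, 4])
doc_idxes = s2d_map[supp_idxes].tolist()               # [0, 1]
sent_idxes = sInd_map[supp_idxes].tolist()             # [1, 2]
for d, s in zip(doc_idxes, sent_idxes):
    print('supporting fact: doc {}, sentence {}'.format(d, s))
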
Code example #5
def data_loader_consistent_checker(train=True):
    file_path = '../data/hotpotqa/distractor_qa'
    if train:
        dev_file_name = 'hotpot_train_distractor_wiki_tokenized.json'
    else:
        dev_file_name = 'hotpot_dev_distractor_wiki_tokenized.json'
    data_frame = read_train_dev_data_frame(PATH=file_path,
                                           json_fileName=dev_file_name)
    longtokenizer = get_hotpotqa_longformer_tokenizer()
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=longtokenizer,
                                               max_length=4096)
    from time import time
    from torch.utils.data import DataLoader
    start_time = time()
    batch_size = 1
    if train:
        dev_dataloader = DataLoader(HotpotTrainDataset(
            data_frame=data_frame, hotpot_tensorizer=hotpot_tensorizer),
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=1,
                                    collate_fn=HotpotTrainDataset.collate_fn)
    else:
        dev_dataloader = DataLoader(HotpotDevDataset(
            data_frame=data_frame, hotpot_tensorizer=hotpot_tensorizer),
                                    batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=1,
                                    collate_fn=HotpotDevDataset.collate_fn)

    head_two = data_frame.head(batch_size)
    print(type(head_two))
    for idx, row in head_two.iterrows():
        context = row['context']
        supp_fact_filtered = row['supp_facts_filtered']
        for supp, sen_idx in supp_fact_filtered:
            print('Support doc: {}, sent id: {}'.format(supp, sen_idx))
            print('-' * 70)
        print()
        print('Query {}'.format(row['question']))
        for doc_idx, doc in enumerate(context):
            print('doc {}: title = {} \n text = {}'.format(
                doc_idx + 1, doc[0], ' '.join(doc[1])))
            print('-' * 70)
        print('*' * 70)
        print()
        print('Original answer = {}'.format(row['norm_answer']))
        print('=' * 70)
    print('+' * 70)
    print('\n' * 5)
    for batch_idx, sample in enumerate(dev_dataloader):
        ctx_encode = sample['ctx_encode']
        doc_start = sample['doc_start'].squeeze(dim=-1)
        sent_start = sample['sent_start'].squeeze(dim=-1)
        answer_start = sample['ans_start'].squeeze(dim=-1)
        answer_end = sample['ans_end'].squeeze(dim=-1)
        if train:
            head_idx = sample['head_idx'].squeeze(dim=-1)
            tail_idx = sample['tail_idx'].squeeze(dim=-1)
        sent_lens = sample['sent_lens'].squeeze(dim=-1)
        attention = sample['ctx_attn_mask'].squeeze(dim=-1)
        global_attention = sample['ctx_global_mask']
        print('global attention {}'.format(global_attention))
        marker = sample['marker'].squeeze(dim=-1)

        doc_num = doc_start.shape[1]
        print('doc num: {} (doc_start shape: {})'.format(doc_num, doc_start.shape))
        print('marker {}'.format(marker))
        print('marker shape {}'.format(marker.shape))

        for idx in range(ctx_encode.shape[0]):
            ctx_i = ctx_encode[idx]
            marker_i = marker[idx]

            marker_idx = marker_i.nonzero().squeeze()
            print('marker text {}'.format(
                longtokenizer.decode(ctx_i[marker_idx])))
            print('*' * 75)
            attention_i = attention[idx]
            attn_idx = (attention_i == 1).nonzero().squeeze()
            print('attn text {}'.format(longtokenizer.decode(ctx_i[attn_idx])))
            sent_start_i = sent_start[idx]
            doc_start_i = doc_start[idx]
            if train:
                head_i = head_idx[idx].data.item()
                tail_i = tail_idx[idx].data.item()
            ans_start_i = answer_start[idx].data.item()
            ans_end_i = answer_end[idx].data.item()

            print('Decode Query {}'.format(
                longtokenizer.decode(ctx_i[:doc_start_i[0]])))
            print('*' * 75)
            print('Decoded answer = {}'.format(
                hotpot_tensorizer.to_string(ctx_i[ans_start_i:(ans_end_i +
                                                               1)])))
            print('*' * 75)
            # print(ans_start_i)

            doc_marker = longtokenizer.decode(ctx_i[doc_start_i])
            print('doc_marker: {}'.format(doc_marker))

            sent_marker = longtokenizer.decode(ctx_i[sent_start_i])
            print('doc: {}\nsent: {}\n{}\n{}'.format(doc_marker, sent_marker,
                                                     sent_start_i.shape,
                                                     sent_lens[idx]))
            print('*' * 75)

            for k in range(doc_num):
                if k < doc_num - 1:
                    # doc_k = hotpot_tensorizer.to_string(ctx_i[doc_start_i[k]:doc_start_i[k+1]])
                    doc_k = longtokenizer.decode(
                        ctx_i[doc_start_i[k]:doc_start_i[k + 1]])
                else:
                    # doc_k = hotpot_tensorizer.to_string(ctx_i[doc_start_i[k]:])
                    doc_k = longtokenizer.decode(ctx_i[doc_start_i[k]:])
                # print(doc_marker)
                print('Supp doc {}: text = {}'.format(k + 1, doc_k))
                if train:
                    if k == head_i:
                        print('=' * 70)
                        print('Head positive doc {}: text: {}'.format(
                            head_i + 1, doc_k))
                        print('=' * 70)
                    if k == tail_i:
                        print('=' * 70)
                        print('Tail positive doc {}: text: {}'.format(
                            tail_i + 1, doc_k))
                        print('=' * 70)
                    print('-' * 70)
            print('*' * 70)
            print()
        # print(ctx_encode.shape)
        break
    print('Runtime = {}'.format(time() - start_time))
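
The document-reconstruction loop at the end of this checker works purely from boundary offsets: document k spans from `doc_start[k]` up to (but not including) `doc_start[k + 1]`, and the last document runs to the end of the sequence. The sketch below isolates that slicing logic on a plain token list; the helper name `split_by_starts` and the toy tokens are invented for the illustration.

def split_by_starts(tokens, starts):
    """Hypothetical helper: cut tokens into segments at the given start offsets."""
    segments = []
    for k, start in enumerate(starts):
        end = starts[k + 1] if k < len(starts) - 1 else len(tokens)
        segments.append(tokens[start:end])
    return segments


tokens = ['<q>', 'who', 'wrote', 'it', '</q>',
          '<d>', 'doc1', 'text', '<d>', 'doc2', 'text', 'more']
doc_starts = [5, 8]        # offsets of the '<d>' markers
for k, doc in enumerate(split_by_starts(tokens, doc_starts)):
    print('doc {}: text = {}'.format(k + 1, ' '.join(doc)))
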