Example #1
def main(args):
    # Build a per-sentence variant of the dataset: keep only the sentences the
    # extractive oracle selected, then emit one single-sentence example per
    # kept sentence with the oracle label hardcoded to [1].
    dataset = args.dataset

    for split in ('train', 'dev', 'test'):
        dir_path = f'summ_data/{dataset}_sent'
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)

        elems = load_elems(f'summ_data/{dataset}/{split}.txt')
        ext_elems = load_elems(f'summ_data/{dataset}/{split}_ext.txt')

        ext_elem_ids = {ext_elem['name'] for ext_elem in ext_elems}
        elems = [elem for elem in elems if elem['name'] in ext_elem_ids]
        assert len(elems) == len(ext_elems)

        out_elems = []
        out_ext_elems = []

        for (elem, ext_elem) in tqdm(zip(elems, ext_elems)):
            assert elem['name'] == ext_elem['name']

            name = elem['name']
            part = f'{dataset}_sent'
            abs_list = elem['abs_list']

            label_list = ext_elem['label_list']
            _doc_list = elem['doc_list']
            _parse_list = elem['parse_list']

            if len(_parse_list) == 0:
                _parse_list = ['']

            doc_list = []
            parse_list = []

            # Take sentences according to extractive oracle.
            for (label, doc, parse) in zip(label_list, _doc_list, _parse_list):
                if label == 1:
                    doc_list.append(doc)
                    parse_list.append(parse)

            # Create per-sentence dataset where the extractive oracle is hardcoded to
            # always pick up that sentence.
            for (sent, parse) in zip(doc_list, parse_list):
                out_elems.append({
                    'name': name,
                    'part': part,
                    'abs_list': abs_list,
                    'doc_list': [sent],
                    'parse_list': [parse]
                })
                out_ext_elems.append({
                    'name': name,
                    'part': part,
                    'label_list': [1]
                })

        write_elems(out_elems, f'summ_data/{dataset}_sent/{split}.txt')
        write_elems(out_ext_elems, f'summ_data/{dataset}_sent/{split}_ext.txt')
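
# --- Illustrative sketch (not part of the original snippet) ---
# The loop above keeps only the sentences the extractive oracle labelled 1 and
# then emits one single-sentence example per kept sentence, with the oracle
# label hardcoded to [1]. The same transform on toy data:
labels = [1, 0, 1]
sents = [['sent', 'a'], ['sent', 'b'], ['sent', 'c']]
parses = ['(S a)', '(S b)', '(S c)']
kept = [(s, p) for (lbl, s, p) in zip(labels, sents, parses) if lbl == 1]
per_sent = [{'doc_list': [s], 'parse_list': [p], 'label_list': [1]} for (s, p) in kept]
assert len(per_sent) == 2  # one output example per oracle-selected sentence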
Example #2
    def __init__(self, path, test_mode=False):
        elems = load_elems(path)
        if args.max_samples != -1 and not test_mode:
            elems = elems[:args.max_samples]

        oracle_path = get_oracle_path(path, 'ext')
        oracle_elems = load_elems(oracle_path)

        if args.max_samples != -1 and not test_mode:
            oracle_elems = oracle_elems[:args.max_samples]

        processor = SummarizationProcessor(elems, oracle_elems)
        self.doc_lists, self.abs_lists, self.label_lists = processor.get_samples()
        self.test_mode = test_mode
Example #3
    def __init__(self, path, test_mode=False):
        elems = load_elems(path)
        ext_oracle_elems = load_elems(get_oracle_path(path, 'ext'))
        cmp_oracle_elems = load_elems(get_oracle_path(path, 'cmp'))

        if args.max_samples != -1 and not test_mode:
            elems = elems[:args.max_samples]
            ext_oracle_elems = ext_oracle_elems[:args.max_samples]
            cmp_oracle_elems = cmp_oracle_elems[:args.max_samples]

        processor = SummarizationProcessor(elems, ext_oracle_elems,
                                           cmp_oracle_elems)

        self.abs_lists, self.doc_lists, self.node_lists = processor.get_samples()
        self.test_mode = test_mode
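
# --- Illustrative sketch (assumption, not the repo's actual helper) ---
# get_oracle_path() is imported from elsewhere in the repo. Given the file
# layout used in Example #1 ({split}.txt next to {split}_ext.txt), a plausible
# definition is:
import os

def get_oracle_path(path, oracle_type):
    root, ext = os.path.splitext(path)   # e.g. 'summ_data/xsum/train', '.txt'
    return f'{root}_{oracle_type}{ext}'  # e.g. 'summ_data/xsum/train_ext.txt'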
Example #4
    if args.do_compress:
        cmp_tokenizer = AutoTokenizer.from_pretrained(args.cmp_model)
        cmp_model = cuda(CompressionModel(args.cmp_model, 0.))
        cmp_model.load_state_dict(torch.load(args.cmp_ckpt_path, map_location=f'cuda:{args.device}'))
        cmp_model.eval()
        print('loaded compression model')

    if args.do_grammar:
        # The grammar model reuses the compression architecture and tokenizer
        # (args.cmp_model); only the fine-tuned checkpoint differs.
        grm_tokenizer = AutoTokenizer.from_pretrained(args.cmp_model)
        grm_model = cuda(CompressionModel(args.cmp_model, 0.))
        grm_model.load_state_dict(torch.load(args.grm_ckpt_path, map_location=f'cuda:{args.device}'))
        grm_model.eval()
        print('loaded grammar model')

    test_elems = load_elems(args.test_path)
    if args.quick_test:
        random.shuffle(test_elems)
        test_elems = test_elems[:100]
    
    test_loader = test_elems
    if not args.do_viz:
        test_loader = tqdm(test_elems, ncols=100)

    hyp_list = []
    ref_list = []
    rate_list = []

    if args.output_path:
        output_elems = []
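
# --- Illustrative sketch (assumption, not the repo's actual helper) ---
# cuda() is defined elsewhere in the repo. Since the checkpoints above are
# loaded with map_location=f'cuda:{args.device}', a plausible definition is:
def cuda(module):
    return module.to(f'cuda:{args.device}')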
Example #5
def compression_metrics(abs_list, doc_list):
    abs_tokens = [word for sent in abs_list for word in sent]
    doc_tokens = [word for sent in doc_list for word in sent]
    ext_spans = build_extractive_spans(abs_tokens, doc_tokens)
    coverage = extractive_coverage(abs_tokens, ext_spans)
    density = extractive_density(abs_tokens, ext_spans)
    return (coverage, density)
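
# --- Illustrative sketch (not the repo's implementation) ---
# build_extractive_spans(), extractive_coverage() and extractive_density() are
# imported from elsewhere. Self-contained versions following the standard
# extractive-fragment definitions of Grusky et al. (2018) might look like:
def build_extractive_spans(abs_tokens, doc_tokens):
    # Greedily match each summary position against the longest shared fragment
    # in the document; return the fragment lengths.
    spans, i = [], 0
    while i < len(abs_tokens):
        best = 0
        for j in range(len(doc_tokens)):
            k = 0
            while (i + k < len(abs_tokens) and j + k < len(doc_tokens)
                   and abs_tokens[i + k] == doc_tokens[j + k]):
                k += 1
            best = max(best, k)
        if best > 0:
            spans.append(best)
            i += best
        else:
            i += 1
    return spans


def extractive_coverage(abs_tokens, spans):
    # Fraction of summary tokens that are copied from the document.
    return sum(spans) / len(abs_tokens) if abs_tokens else 0.


def extractive_density(abs_tokens, spans):
    # Average squared fragment length per summary token.
    return sum(length ** 2 for length in spans) / len(abs_tokens) if abs_tokens else 0.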


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', type=str)
    args = parser.parse_args()
    
    elems = load_elems(args.path)
    coverage_list = []
    density_list = []

    for elem in tqdm(elems, ncols=100):
        coverage, density = compression_metrics(
            elem['abs_list'], elem['doc_list']
        )
        coverage_list.append(coverage)
        density_list.append(density)

    out_elems = []
    for (coverage, density) in zip(coverage_list, density_list):
        out_elems.append({'coverage': coverage, 'density': density})

    with open('curation_cmp.txt', 'w+') as f:
Example #6
    return compressions_list


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', type=str)
    parser.add_argument('--ext_oracle_path', type=str)
    parser.add_argument('--output_path', type=str)
    args = parser.parse_args()
    print(args)
    print()

    compression_oracle_elems = []

    input_loader = load_elems(args.input_path)
    extractor_loader = load_elems(args.ext_oracle_path)
    loader = tqdm(zip(input_loader, extractor_loader), ncols=100)
    empty_slots = 0

    for (i, (elem, oracle_elem)) in enumerate(loader, 1):
        abs_list = elem['abs_list']
        _doc_list = elem['doc_list']
        _parse_list = elem['parse_list']

        extractor_oracle_labels = oracle_elem['label_list']

        doc_list = [
            sent_list
            for (sent_list,
                 sent_label) in zip(_doc_list, extractor_oracle_labels)
Example #7
        if curr_id == -1:
            return (sorted(selected_idxs), max_rouge)

        selected_idxs.append(curr_id)
        max_rouge = curr_max_rouge

    return (sorted(selected_idxs), max_rouge)
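
# --- Illustrative sketch (not the repo's implementation) ---
# The lines above are the tail of greedy_search(). A self-contained version of
# the same greedy extractive-oracle loop, using a simple unigram-F1 proxy in
# place of the project's ROUGE scorer, might look like:
def unigram_f1(hyp_tokens, ref_tokens):
    common = len(set(hyp_tokens) & set(ref_tokens))
    if common == 0:
        return 0.
    precision = common / len(hyp_tokens)
    recall = common / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


def greedy_search_sketch(abs_list, doc_list, budget):
    ref_tokens = [word for sent in abs_list for word in sent]
    selected_idxs, max_rouge = [], 0.

    for _ in range(budget):
        curr_id, curr_max_rouge = -1, max_rouge
        # Try adding each unselected sentence; keep the one that helps most.
        for idx in range(len(doc_list)):
            if idx in selected_idxs:
                continue
            hyp_tokens = [
                word for i in sorted(selected_idxs + [idx]) for word in doc_list[i]
            ]
            score = unigram_f1(hyp_tokens, ref_tokens)
            if score > curr_max_rouge:
                curr_id, curr_max_rouge = idx, score

        if curr_id == -1:
            return (sorted(selected_idxs), max_rouge)

        selected_idxs.append(curr_id)
        max_rouge = curr_max_rouge

    return (sorted(selected_idxs), max_rouge)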


if __name__ == '__main__':
    hyp_list = []
    ref_list = []

    oracle_elems = []

    loader = tqdm(load_elems(args.input_path), ncols=100)
    rouge_sum = 0.

    for i, elem in enumerate(loader, 1):
        doc_list = elem['doc_list']
        abs_list = elem['abs_list']

        if sum(map(len, doc_list)) > 0:
            doc_list = [sent[:200] for sent in doc_list][:100]
            doc_list = limit_doc(doc_list)
            oracle_idxs, oracle_rouge = greedy_search(abs_list,
                                                      doc_list,
                                                      budget=args.ext_size)
            oracle_tokens = [doc_list[idx] for idx in oracle_idxs]
        else:
            oracle_idxs = []
Example #8
def compression_rate(sent_labels):
    return sum(sent_labels) / len(sent_labels)
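
# --- Illustrative usage (not part of the original snippet) ---
# compression_rate() is the fraction of labels that are 1 (i.e. kept):
assert compression_rate([1, 0, 1, 1]) == 0.75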


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', type=str)
    parser.add_argument('--output_path', type=str)
    args = parser.parse_args()
    print(args)
    print()

    random.seed(0)

    doc_elems = load_elems(args.input_path)
    sent_elems = []

    total = 0

    for doc_elem in tqdm(doc_elems):
        name = doc_elem['name']
        part = doc_elem['part']

        doc_list = doc_elem['doc_list']
        tree_list = doc_elem['doc_parse']
        label_list = doc_elem['label_list']

        # Find candidate sentences that meet compression threshold
        candidate_sents = [
            (sent_list, sent_parse, sent_labels)