def main(args):
    dataset = args.dataset
    for split in ('train', 'dev', 'test'):
        dir_path = f'summ_data/{dataset}_sent'
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)

        elems = load_elems(f'summ_data/{dataset}/{split}.txt')
        ext_elems = load_elems(f'summ_data/{dataset}/{split}_ext.txt')

        ext_elem_ids = set([ext_elem['name'] for ext_elem in ext_elems])
        elems = [elem for elem in elems if elem['name'] in ext_elem_ids]
        assert len(elems) == len(ext_elems)

        out_elems = []
        out_ext_elems = []
        for (elem, ext_elem) in tqdm(zip(elems, ext_elems)):
            assert elem['name'] == ext_elem['name']

            name = elem['name']
            part = f'{dataset}_sent'
            abs_list = elem['abs_list']
            label_list = ext_elem['label_list']

            _doc_list = elem['doc_list']
            _parse_list = elem['parse_list']
            if len(_parse_list) == 0:
                # Pad with empty parses so every sentence survives the zip below.
                _parse_list = [''] * len(_doc_list)

            doc_list = []
            parse_list = []
            # Take sentences according to extractive oracle.
            for (label, doc, parse) in zip(label_list, _doc_list, _parse_list):
                if label == 1:
                    doc_list.append(doc)
                    parse_list.append(parse)

            # Create per-sentence dataset where the extractive oracle is hardcoded to
            # always pick up that sentence.
            for (sent, parse) in zip(doc_list, parse_list):
                out_elems.append({
                    'name': name,
                    'part': part,
                    'abs_list': abs_list,
                    'doc_list': [sent],
                    'parse_list': [parse],
                })
                out_ext_elems.append({
                    'name': name,
                    'part': part,
                    'label_list': [1],
                })

        write_elems(out_elems, f'summ_data/{dataset}_sent/{split}.txt')
        write_elems(out_ext_elems, f'summ_data/{dataset}_sent/{split}_ext.txt')
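# A minimal driver sketch for the script above, assuming `main` is invoked with
# a single --dataset argument (hypothetical wiring; the actual entry point is
# not shown in this excerpt):
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str)
    args = parser.parse_args()
    print(args)
    print()

    main(args)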
def __init__(self, path, test_mode=False):
    elems = load_elems(path)
    if args.max_samples != -1 and not test_mode:
        elems = elems[:args.max_samples]

    oracle_path = get_oracle_path(path, 'ext')
    oracle_elems = load_elems(oracle_path)
    if args.max_samples != -1 and not test_mode:
        oracle_elems = oracle_elems[:args.max_samples]

    processor = SummarizationProcessor(elems, oracle_elems)
    self.doc_lists, self.abs_lists, self.label_lists = processor.get_samples()

    self.test_mode = test_mode
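# `get_oracle_path` is defined elsewhere in the repo; a minimal sketch of what
# it likely does, assuming oracle files sit next to the inputs with an
# `_{oracle_type}` suffix (e.g. `train.txt` -> `train_ext.txt`), matching the
# file naming used by the preprocessing script above.  Illustration only, not
# the repo's implementation.
import os

def get_oracle_path(path, oracle_type):
    base, ext = os.path.splitext(path)   # e.g. 'summ_data/xsum/train', '.txt'
    return f'{base}_{oracle_type}{ext}'  # e.g. 'summ_data/xsum/train_ext.txt'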
def __init__(self, path, test_mode=False):
    elems = load_elems(path)
    ext_oracle_elems = load_elems(get_oracle_path(path, 'ext'))
    cmp_oracle_elems = load_elems(get_oracle_path(path, 'cmp'))

    if args.max_samples != -1 and not test_mode:
        elems = elems[:args.max_samples]
        ext_oracle_elems = ext_oracle_elems[:args.max_samples]
        cmp_oracle_elems = cmp_oracle_elems[:args.max_samples]

    processor = SummarizationProcessor(elems, ext_oracle_elems, cmp_oracle_elems)
    self.abs_lists, self.doc_lists, self.node_lists = processor.get_samples()

    self.test_mode = test_mode
if args.do_compress:
    cmp_tokenizer = AutoTokenizer.from_pretrained(args.cmp_model)
    cmp_model = cuda(CompressionModel(args.cmp_model, 0.))
    cmp_model.load_state_dict(
        torch.load(args.cmp_ckpt_path, map_location=f'cuda:{args.device}'))
    cmp_model.eval()
    print('loaded compression model')

if args.do_grammar:
    grm_tokenizer = AutoTokenizer.from_pretrained(args.cmp_model)
    grm_model = cuda(CompressionModel(args.cmp_model, 0.))
    grm_model.load_state_dict(
        torch.load(args.grm_ckpt_path, map_location=f'cuda:{args.device}'))
    grm_model.eval()
    print('loaded grammar model')

test_elems = load_elems(args.test_path)
if args.quick_test:
    random.shuffle(test_elems)
    test_elems = test_elems[:100]

test_loader = test_elems
if not args.do_viz:
    test_loader = tqdm(test_elems, ncols=100)

hyp_list = []
ref_list = []
rate_list = []
if args.output_path:
    output_elems = []
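# `cuda(...)` is a project helper that is not shown in this excerpt.  A minimal
# sketch, assuming it simply moves a module onto the GPU selected by --device
# (illustration only; the real helper may also handle the CPU-only case):
def cuda(module):
    return module.to(f'cuda:{args.device}')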
def compression_metrics(abs_list, doc_list):
    abs_tokens = [word for sent in abs_list for word in sent]
    doc_tokens = [word for sent in doc_list for word in sent]

    ext_spans = build_extractive_spans(abs_tokens, doc_tokens)
    coverage = extractive_coverage(abs_tokens, ext_spans)
    density = extractive_density(abs_tokens, ext_spans)

    return (coverage, density)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', type=str)
    args = parser.parse_args()

    elems = load_elems(args.path)

    coverage_list = []
    density_list = []
    for elem in tqdm(elems, ncols=100):
        coverage, density = compression_metrics(elem['abs_list'], elem['doc_list'])
        coverage_list.append(coverage)
        density_list.append(density)

    out_elems = []
    for (x, y) in zip(coverage_list, density_list):
        out_elems.append({'coverage': x, 'density': y})

    with open('curation_cmp.txt', 'w+') as f:
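# `build_extractive_spans`, `extractive_coverage`, and `extractive_density` are
# defined elsewhere in the repo.  A minimal sketch of the two metrics, assuming
# they follow the extractive-fragment coverage/density of Grusky et al. (2018),
# with `ext_spans` being a list of abstract fragments (token lists) that appear
# verbatim in the document.  Illustration only, not the repo's implementation.
def extractive_coverage(abs_tokens, ext_spans):
    # Fraction of abstract tokens that lie inside a copied fragment.
    return sum(len(span) for span in ext_spans) / max(len(abs_tokens), 1)


def extractive_density(abs_tokens, ext_spans):
    # Mean squared fragment length; rewards long verbatim copies.
    return sum(len(span) ** 2 for span in ext_spans) / max(len(abs_tokens), 1)


# Example: a 10-token abstract with copied fragments of lengths 4 and 2 has
# coverage (4 + 2) / 10 = 0.6 and density (16 + 4) / 10 = 2.0.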
    return compressions_list


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', type=str)
    parser.add_argument('--ext_oracle_path', type=str)
    parser.add_argument('--output_path', type=str)
    args = parser.parse_args()
    print(args)
    print()

    compression_oracle_elems = []

    input_loader = load_elems(args.input_path)
    extractor_loader = load_elems(args.ext_oracle_path)
    loader = tqdm(zip(input_loader, extractor_loader), ncols=100)

    empty_slots = 0
    for (i, (elem, oracle_elem)) in enumerate(loader, 1):
        abs_list = elem['abs_list']
        _doc_list = elem['doc_list']
        _parse_list = elem['parse_list']
        extractor_oracle_labels = oracle_elem['label_list']

        # Keep only the sentences selected by the extractive oracle.
        doc_list = [
            sent_list
            for (sent_list, sent_label) in zip(_doc_list, extractor_oracle_labels)
            if sent_label == 1
        ]
        if curr_id == -1:
            return (list(sorted(selected_idxs)), max_rouge)
        selected_idxs.append(curr_id)
        max_rouge = curr_max_rouge

    return (list(sorted(selected_idxs)), max_rouge)


if __name__ == '__main__':
    hyp_list = []
    ref_list = []
    oracle_elems = []

    loader = tqdm(load_elems(args.input_path), ncols=100)
    rouge_sum = 0.
    for i, elem in enumerate(loader, 1):
        doc_list = elem['doc_list']
        abs_list = elem['abs_list']

        if sum(map(len, doc_list)) > 0:
            # Truncate long sentences and documents before running the oracle search.
            doc_list = [sent[:200] for sent in doc_list][:100]
            doc_list = limit_doc(doc_list)

            oracle_idxs, oracle_rouge = greedy_search(abs_list, doc_list, budget=args.ext_size)
            oracle_tokens = [doc_list[idx] for idx in oracle_idxs]
        else:
            oracle_idxs = []
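# For context, a minimal sketch of the greedy selection loop that the tail of
# `greedy_search` above belongs to: at each step, add the sentence that most
# improves the score against the abstract, stopping early once no sentence
# helps.  `unigram_f1` is a stand-in scorer (an assumption), not the project's
# actual ROUGE implementation.
from collections import Counter

def unigram_f1(hyp_tokens, ref_tokens):
    overlap = sum((Counter(hyp_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.
    p = overlap / len(hyp_tokens)
    r = overlap / len(ref_tokens)
    return 2 * p * r / (p + r)

def greedy_search_sketch(abs_list, doc_list, budget):
    ref_tokens = [w for sent in abs_list for w in sent]
    max_rouge = 0.
    selected_idxs = []
    for _ in range(budget):
        curr_id = -1
        curr_max_rouge = max_rouge
        for i in range(len(doc_list)):
            if i in selected_idxs:
                continue
            # Score the current selection plus candidate sentence i.
            hyp_tokens = [w for idx in sorted(selected_idxs + [i]) for w in doc_list[idx]]
            score = unigram_f1(hyp_tokens, ref_tokens)
            if score > curr_max_rouge:
                curr_max_rouge = score
                curr_id = i
        if curr_id == -1:
            return (list(sorted(selected_idxs)), max_rouge)
        selected_idxs.append(curr_id)
        max_rouge = curr_max_rouge
    return (list(sorted(selected_idxs)), max_rouge)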
def compression_rate(sent_labels):
    return sum(sent_labels) / len(sent_labels)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', type=str)
    parser.add_argument('--output_path', type=str)
    args = parser.parse_args()
    print(args)
    print()

    random.seed(0)

    doc_elems = load_elems(args.input_path)

    sent_elems = []
    total = 0
    for doc_elem in tqdm(doc_elems):
        name = doc_elem['name']
        part = doc_elem['part']
        doc_list = doc_elem['doc_list']
        tree_list = doc_elem['doc_parse']
        label_list = doc_elem['label_list']

        # Find candidate sentences that meet compression threshold
        candidate_sents = [
            (sent_list, sent_parse, sent_labels)