def _process_batch(parallel: Parallel,
                   batch: List[Dict[str, List[str]]],
                   python_rouge: PythonRouge,
                   out: JsonlWriter) -> None:
    jobs = []
    documents = []
    for instance in batch:
        # Flatten the nested documents -> paragraphs -> sentences
        # structure into a single list of sentences.
        document = [
            sentence
            for document in instance['documents']
            for paragraph in document['paragraphs']
            for sentence in paragraph
        ]
        cloze = instance['cloze']
        job = delayed(get_greedy_oracle_summary)(document, [cloze], R1_RECALL,
                                                 use_porter_stemmer=True,
                                                 remove_stopwords=False,
                                                 python_rouge=python_rouge)
        jobs.append(job)
        documents.append(document)

    results = parallel(jobs)
    for instance, document, (_, labels) in zip(batch, documents, results):
        id_ = instance['id']
        page_title = instance['page_title']
        headings = instance['headings']
        topics = [page_title] + headings
        context = instance['left_context']
        cloze = instance['cloze']
        output_data = {
            'id': id_,
            'topics': topics,
            'document': document,
            'context': context,
            'cloze': cloze,
            'labels': labels
        }
        out.write(output_data)


def main(args):
    dfs, num_documents, avg_document_length = load_dfs(args.df_jsonl)
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                context = instance['context']
                context_tokens = set(token.lower()
                                     for sentence in context
                                     for token in sentence.split())
                document = instance['document']
                bm25_scores = []
                for sentence in document:
                    tokenized_sentence = [token.lower() for token in sentence.split()]
                    bm25 = calculate_bm25(context_tokens, tokenized_sentence,
                                          dfs, num_documents,
                                          avg_document_length, args.k, args.b)
                    bm25_scores.append((bm25, sentence))
                cloze = get_cloze(bm25_scores, args.max_words,
                                  args.max_sentences, args.flatten)
                out.write({'cloze': cloze})


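# `calculate_bm25` is not shown in this file. Below is a minimal sketch of a
# standard Okapi BM25 scorer matching the call signature above (the context
# acts as the query; each sentence is scored as a "document"). This is an
# illustrative assumption, not necessarily the repository's implementation.
import math
from collections import Counter
from typing import Dict, List, Set


def calculate_bm25(query_tokens: Set[str],
                   document_tokens: List[str],
                   dfs: Dict[str, int],
                   num_documents: int,
                   avg_document_length: float,
                   k: float,
                   b: float) -> float:
    score = 0.0
    tfs = Counter(document_tokens)
    doc_length = len(document_tokens)
    for token in query_tokens:
        if token not in tfs:
            continue
        df = dfs.get(token, 0)
        # Standard smoothed BM25 idf
        idf = math.log((num_documents - df + 0.5) / (df + 0.5) + 1.0)
        tf = tfs[token]
        numerator = tf * (k + 1)
        denominator = tf + k * (1 - b + b * doc_length / avg_document_length)
        score += idf * numerator / denominator
    return score

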
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                labels = instance['labels']
                summary = [document[index] for index in labels]
                out.write({'summary': summary})


def save_data(instances: Dict[str, Any], urls: List[str], file_path: str) -> None:
    with JsonlWriter(file_path) as out:
        for url in tqdm(urls, desc=f'Saving instances to {file_path}'):
            hash_ = get_url_hash(url)
            if hash_ in instances:
                instance = instances[hash_]
                instance['url'] = url
                out.write(instance)


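# `get_url_hash` is not shown here. A plausible sketch, assuming the usual
# CNN/DailyMail convention of identifying instances by the SHA1 hex digest
# of their URL; treat this as an assumption, not the repository's code.
import hashlib


def get_url_hash(url: str) -> str:
    # SHA1 hex digest of the UTF-8 encoded URL (the CNN/DailyMail naming
    # convention for story files).
    return hashlib.sha1(url.encode()).hexdigest()

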
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.source_jsonl) as source:
            with JsonlReader(args.target_jsonl) as target:
                for source_instance, target_instance in zip(source, target):
                    for source_field, target_field in args.field_names:
                        target_instance[target_field] = source_instance[source_field]
                    out.write(target_instance)


def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with open(args.input_tsv, 'r') as f:
            for line in f:
                # Strip the <t> ... </t> sentence delimiters and
                # renormalize whitespace.
                line = line.strip()
                line = line.replace('<t>', '').replace('</t>', '')
                line = ' '.join(line.split())
                summary = [line]
                out.write({'summary': summary})


def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                summary = get_lead_summary(document,
                                           max_sentences=args.max_sentences,
                                           max_tokens=args.max_tokens,
                                           max_bytes=args.max_bytes)
                out.write({'summary': summary})


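# `get_lead_summary` is not defined in this file. One simple reading of its
# signature: take sentences from the front of the document until a budget is
# exhausted. The real implementation may instead truncate the final sentence
# to fit a budget; this sketch and its stopping rule are assumptions.
from typing import List, Optional


def get_lead_summary(document: List[str],
                     max_sentences: Optional[int] = None,
                     max_tokens: Optional[int] = None,
                     max_bytes: Optional[int] = None) -> List[str]:
    summary = []
    num_tokens, num_bytes = 0, 0
    for i, sentence in enumerate(document):
        # Stop as soon as adding the next sentence would exceed any budget.
        if max_sentences is not None and i >= max_sentences:
            break
        if max_tokens is not None and num_tokens + len(sentence.split()) > max_tokens:
            break
        if max_bytes is not None and num_bytes + len(sentence) > max_bytes:
            break
        summary.append(sentence)
        num_tokens += len(sentence.split())
        num_bytes += len(sentence)
    return summary

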
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                labels = instance['labels']
                cloze = [document[index] for index in labels]
                if not args.keep_sentences:
                    cloze = ' '.join(cloze)
                out.write({args.field_name: cloze})


def _process_batch(parallel: Parallel,
                   batch: List[Dict[str, List[str]]],
                   max_tokens: int,
                   python_rouge: PythonRouge,
                   out: JsonlWriter) -> None:
    jobs = []
    for instance in batch:
        document = instance['document']
        summary = instance['summary']
        job = delayed(get_greedy_oracle_summary)(document, summary, R1_RECALL,
                                                 max_tokens=max_tokens,
                                                 use_porter_stemmer=True,
                                                 remove_stopwords=True,
                                                 python_rouge=python_rouge)
        jobs.append(job)

    results = parallel(jobs)
    for instance, (_, labels) in zip(batch, results):
        instance['labels'] = labels
        out.write(instance)


def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                cloze = get_lead_summary(document,
                                         max_sentences=args.max_sentences,
                                         max_tokens=args.max_tokens,
                                         max_bytes=args.max_bytes)
                if not args.keep_sentences:
                    cloze = ' '.join(cloze)
                out.write({args.field_name: cloze})


def main(args):
    if args.backend == 'spacy':
        nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
    elif args.backend == 'nltk':
        nlp = nltk.word_tokenize

    with JsonlWriter(args.output_file) as out:
        with JsonlReader(args.input_file) as f:
            for instance in tqdm(f, desc=f'Tokenizing {args.input_file}'):
                for field in args.fields:
                    instance[field] = tokenize(nlp, instance[field])
                out.write(instance)


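# `tokenize` is not shown in this file. A minimal sketch that works with both
# backends loaded above (a spacy pipeline or nltk.word_tokenize) and accepts
# either a raw string or a nested list of strings. Note this sketch only
# word-tokenizes; the repository's version may also split sentences.
from typing import Any, Union


def tokenize(nlp: Any, field: Union[str, list]) -> Union[str, list]:
    # Recurse into nested lists, tokenizing each string leaf.
    if isinstance(field, list):
        return [tokenize(nlp, item) for item in field]
    tokens = nlp(field)
    # spacy returns a Doc of Token objects; nltk returns a list of strings.
    if not isinstance(tokens, list):
        tokens = [token.text for token in tokens]
    return ' '.join(tokens)

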
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with open(args.src_tsv, 'r') as f_src:
            with open(args.tgt_tsv, 'r') as f_tgt:
                for src, tgt in zip(f_src, f_tgt):
                    if len(src.strip()) == 0:
                        continue
                    document = [src.strip()]
                    # The target sentences are wrapped in <t> ... </t> tags.
                    summary = []
                    for match in re.findall(r'<t> (.+?) </t>', tgt):
                        summary.append(match)
                    out.write({'document': document, 'summary': summary})


def test_bz2_file(self):
    # Write the data to a file
    temp_file = tempfile.NamedTemporaryFile(suffix='.jsonl.bz2')
    with JsonlWriter(temp_file.name) as out:
        for item in self.data:
            out.write(item)

    # Load from file, ensure it is correct
    actual_data = []
    with bz2.open(temp_file.name, 'rb') as f:
        for line in f:
            actual_data.append(json.loads(line.decode()))
    self.assertEqual(self.data, actual_data)


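# For reference, a minimal sketch of a `JsonlWriter` consistent with this
# test: it picks the open function from the file extension and writes one
# JSON object per line. The real class may do more (create directories,
# accept gzip, buffer writes, etc.); this is an assumption.
import bz2
import gzip
import json


class JsonlWriter:
    def __init__(self, file_path: str) -> None:
        self.file_path = file_path

    def __enter__(self) -> 'JsonlWriter':
        # Dispatch on the extension so '.jsonl.bz2' and '.jsonl.gz' are
        # transparently compressed.
        if self.file_path.endswith('.bz2'):
            self.file = bz2.open(self.file_path, 'wb')
        elif self.file_path.endswith('.gz'):
            self.file = gzip.open(self.file_path, 'wb')
        else:
            self.file = open(self.file_path, 'wb')
        return self

    def write(self, data) -> None:
        self.file.write(json.dumps(data).encode() + b'\n')

    def __exit__(self, *args) -> None:
        self.file.close()

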
def main(args):
    python_rouge = PythonRouge()
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            with Parallel(n_jobs=args.num_cores) as parallel:
                batch = []
                for instance in tqdm(f):
                    batch.append(instance)
                    if len(batch) == _BATCH_SIZE:
                        _process_batch(parallel, batch, python_rouge, out)
                        batch.clear()
                # Process any leftover partial batch
                if batch:
                    _process_batch(parallel, batch, python_rouge, out)


def main(args):
    dirname = os.path.dirname(args.output_jsonl)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                document = instance['document']
                topics = instance['topics']
                context = instance['context']
                cloze = run_sumfocus(document, topics, context, args.beta,
                                     args.topic_lambda, args.context_lambda,
                                     args.max_words, args.max_sentences)
                cloze = ' '.join(cloze)
                out.write({'cloze': cloze})


def main(args):
    python_rouge = PythonRouge()
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                summary = instance['summary']
                _, labels = get_greedy_oracle_summary(document, summary, args.metric,
                                                      max_sentences=args.max_sentences,
                                                      max_tokens=args.max_tokens,
                                                      max_bytes=args.max_bytes,
                                                      use_porter_stemmer=args.use_stemmer,
                                                      remove_stopwords=args.remove_stopwords,
                                                      python_rouge=python_rouge)
                instance['labels'] = labels
                out.write(instance)


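# The core of `get_greedy_oracle_summary` is a standard greedy oracle: keep
# adding whichever remaining sentence most improves the metric against the
# reference, and stop when no sentence helps. A simplified sketch with a
# generic scoring callback, omitting the budget and preprocessing arguments
# (the names here are illustrative assumptions):
from typing import Callable, List, Tuple


def greedy_oracle(document: List[str],
                  reference: List[str],
                  score_fn: Callable[[List[str], List[str]], float]
                  ) -> Tuple[List[str], List[int]]:
    summary, labels = [], []
    best_score = 0.0
    remaining = set(range(len(document)))
    while remaining:
        # Score the summary-so-far extended by each candidate sentence.
        scores = [(score_fn(summary + [document[i]], reference), i)
                  for i in sorted(remaining)]
        score, index = max(scores)
        if score <= best_score:
            break
        best_score = score
        summary.append(document[index])
        labels.append(index)
        remaining.remove(index)
    return summary, sorted(labels)

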
def main(args):
    model_dir = args.model_dir
    length = args.length
    temperature = args.temperature
    top_k = args.top_k
    seed = args.seed
    lm = OpenAILanguageModel(model_dir, length, temperature, top_k, seed=seed)

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                context = instance['context']
                context = ' '.join(context)
                first_sentence = lm.sample_next_sentence(context)
                output_data = {'cloze': first_sentence}
                out.write(output_data)


def main(args):
    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                instance['headings'] = [
                    tokenize(nlp, heading) for heading in instance['headings']
                ]
                for document in instance['documents']:
                    if document['title']:
                        document['title'] = tokenize(nlp, document['title'])
                    document['paragraphs'] = tokenize(nlp, document['paragraphs'])
                instance['left_context'] = tokenize(nlp, instance['left_context'])
                instance['cloze'] = tokenize(nlp, instance['cloze'])
                instance['right_context'] = tokenize(nlp, instance['right_context'])
                out.write(instance)


def main(args):
    dfs = Counter()
    total_document_length = 0
    num_documents = 0
    with JsonlReader(args.input_jsonl) as f:
        for instance in tqdm(f, desc='Calculating document frequencies'):
            document = instance['document']
            for sentence in document:
                # Each sentence is treated as its own "document" for the
                # BM25 statistics.
                tokens = sentence.lower().split()
                total_document_length += len(tokens)
                num_documents += 1
                for token in set(tokens):
                    dfs[token] += 1

    average_document_length = total_document_length / num_documents
    with JsonlWriter(args.output_jsonl) as out:
        # First record: corpus statistics. Remaining records: one token's
        # document frequency each.
        out.write({
            'num_documents': num_documents,
            'average_document_length': average_document_length
        })
        for token, df in dfs.items():
            out.write({'token': token, 'df': df})


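# `load_dfs` (used by the BM25 scoring script above) would be the counterpart
# reader for this format. A sketch grounded in the records written here,
# though the exact implementation is an assumption:
from typing import Dict, Tuple


def load_dfs(file_path: str) -> Tuple[Dict[str, int], int, float]:
    dfs = {}
    with JsonlReader(file_path) as f:
        iterator = iter(f)
        # The first record holds the corpus statistics.
        stats = next(iterator)
        num_documents = stats['num_documents']
        avg_document_length = stats['average_document_length']
        # Every following record holds one token's document frequency.
        for record in iterator:
            dfs[record['token']] = record['df']
    return dfs, num_documents, avg_document_length

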
def main(args):
    python_rouge = PythonRouge()
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                cloze = instance['cloze']
                oracle, labels = get_greedy_oracle_summary(document, [cloze], args.metric,
                                                           max_sentences=args.max_sentences,
                                                           max_tokens=args.max_tokens,
                                                           max_bytes=args.max_bytes,
                                                           use_porter_stemmer=args.use_stemmer,
                                                           remove_stopwords=args.remove_stopwords,
                                                           python_rouge=python_rouge)
                if args.cloze_only:
                    oracle = ' '.join(oracle)
                    out.write({'cloze': oracle})
                else:
                    instance['labels'] = labels
                    out.write(instance)


def save_data(data: List[Dict[str, List[str]]], file_path: str) -> None:
    with JsonlWriter(file_path) as out:
        for item in tqdm(data, desc=f'Writing instances to {file_path}'):
            out.write(item)