Example #1
def _process_batch(parallel: Parallel, batch: List[Dict[str, List[str]]],
                   python_rouge: PythonRouge, out: JsonlWriter) -> None:
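    # Flatten each instance's nested documents into one list of sentences and
    # queue a parallel job that greedily picks oracle sentences (ROUGE-1
    # recall) against the cloze; the returned labels are written out along
    # with the instance's topics and context.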
    jobs = []
    documents = []
    for instance in batch:
        document = [
            sentence for document in instance['documents']
            for paragraph in document['paragraphs'] for sentence in paragraph
        ]
        cloze = instance['cloze']
        job = delayed(get_greedy_oracle_summary)(document, [cloze],
                                                 R1_RECALL,
                                                 use_porter_stemmer=True,
                                                 remove_stopwords=False,
                                                 python_rouge=python_rouge)
        jobs.append(job)
        documents.append(document)

    results = parallel(jobs)
    for instance, document, (_, labels) in zip(batch, documents, results):
        id_ = instance['id']
        page_title = instance['page_title']
        headings = instance['headings']
        topics = [page_title] + headings
        context = instance['left_context']
        cloze = instance['cloze']
        output_data = {
            'id': id_,
            'topics': topics,
            'document': document,
            'context': context,
            'cloze': cloze,
            'labels': labels
        }
        out.write(output_data)
Example #2
def main(args):
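    # Load precomputed document frequencies, score every document sentence
    # against the context tokens with BM25, and pass the scored sentences to
    # get_cloze to build the output cloze.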
    dfs, num_documents, avg_document_length = load_dfs(args.df_jsonl)

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                context = instance['context']
                context_tokens = set(token.lower() for sentence in context
                                     for token in sentence.split())
                document = instance['document']

                bm25_scores = []
                for sentence in document:
                    tokenized_sentence = [
                        token.lower() for token in sentence.split()
                    ]
                    bm25 = calculate_bm25(context_tokens, tokenized_sentence,
                                          dfs, num_documents,
                                          avg_document_length, args.k, args.b)

                    bm25_scores.append((bm25, sentence))

                cloze = get_cloze(bm25_scores, args.max_words,
                                  args.max_sentences, args.flatten)
                out.write({'cloze': cloze})
Example #3
def main(args):
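    # Rebuild the extractive summary by selecting the sentences at the
    # labeled indices.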
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                labels = instance['labels']
                summary = [document[index] for index in labels]
                out.write({'summary': summary})
Example #4
def save_data(instances: Dict[str, Any], urls: List[str], file_path: str) -> None:
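    # Write each instance whose URL hash is present in `instances`, attaching
    # the original URL before writing.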
    with JsonlWriter(file_path) as out:
        for url in tqdm(urls, desc=f'Saving instances to {file_path}'):
            hash_ = get_url_hash(url)
            if hash_ in instances:
                instance = instances[hash_]
                instance['url'] = url
                out.write(instance)
Example #5
def main(args):
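    # Copy the configured (source, target) fields from each source instance
    # onto the corresponding target instance before writing it out.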
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.source_jsonl) as source:
            with JsonlReader(args.target_jsonl) as target:
                for source_instance, target_instance in zip(source, target):
                    for source_field, target_field in args.field_names:
                        target_instance[target_field] = source_instance[
                            source_field]
                    out.write(target_instance)
Example #6
def main(args):
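    # Strip the <t>/</t> sentence markers from each TSV line, collapse
    # whitespace, and write the line as a single-sentence summary.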
    with JsonlWriter(args.output_jsonl) as out:
        with open(args.input_tsv, 'r') as f:
            for line in f:
                line = line.strip()
                line = line.replace('<t>', '').replace('</t>', '')
                line = ' '.join(line.split())
                summary = [line]
                out.write({'summary': summary})
Example #7
def main(args):
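    # Take the lead of each document as the summary, subject to the sentence,
    # token, and byte limits.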
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                summary = get_lead_summary(document,
                                           max_sentences=args.max_sentences,
                                           max_tokens=args.max_tokens,
                                           max_bytes=args.max_bytes)
                out.write({'summary': summary})
Example #8
def main(args):
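    # Use the labeled sentences as the cloze, joined into a single string
    # unless keep_sentences is set.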
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                labels = instance['labels']
                cloze = [document[index] for index in labels]
                if not args.keep_sentences:
                    cloze = ' '.join(cloze)
                out.write({args.field_name: cloze})
Example #9
def _process_batch(parallel: Parallel, batch: List[Dict[str, List[str]]],
                   max_tokens: int, python_rouge: PythonRouge,
                   out: JsonlWriter) -> None:
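    # Queue one greedy oracle-summary job per instance (ROUGE-1 recall, with
    # stemming and stopword removal), then attach the resulting sentence
    # labels to each instance before writing it.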
    jobs = []
    for instance in batch:
        document = instance['document']
        summary = instance['summary']
        job = delayed(get_greedy_oracle_summary)(document,
                                                 summary,
                                                 R1_RECALL,
                                                 max_tokens=max_tokens,
                                                 use_porter_stemmer=True,
                                                 remove_stopwords=True,
                                                 python_rouge=python_rouge)
        jobs.append(job)

    results = parallel(jobs)
    for instance, (_, labels) in zip(batch, results):
        instance['labels'] = labels
        out.write(instance)
Example #10
def main(args):
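    # Use the lead of each document as the cloze, joined into a single string
    # unless keep_sentences is set.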
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                cloze = get_lead_summary(document,
                                         max_sentences=args.max_sentences,
                                         max_tokens=args.max_tokens,
                                         max_bytes=args.max_bytes)
                if not args.keep_sentences:
                    cloze = ' '.join(cloze)
                out.write({args.field_name: cloze})
Example #11
def main(args):
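    # Tokenize the requested fields of every instance with either spaCy or
    # NLTK, depending on the chosen backend.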
    if args.backend == 'spacy':
        nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
    elif args.backend == 'nltk':
        nlp = nltk.word_tokenize

    with JsonlWriter(args.output_file) as out:
        with JsonlReader(args.input_file) as f:
            for instance in tqdm(f, desc=f'Tokenizing {args.input_file}'):
                for field in args.fields:
                    instance[field] = tokenize(nlp, instance[field])
                out.write(instance)
Example #12
def main(args):
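    # Skip empty source lines; the source line becomes a single-sentence
    # document and the <t>-delimited spans of the target line become the
    # summary sentences.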
    with JsonlWriter(args.output_jsonl) as out:
        with open(args.src_tsv, 'r') as f_src:
            with open(args.tgt_tsv, 'r') as f_tgt:
                for src, tgt in zip(f_src, f_tgt):
                    if len(src.strip()) == 0:
                        continue

                    document = [src.strip()]
                    summary = []
                    for match in re.findall(r'<t> (.+?) </t>', tgt):
                        summary.append(match)
                    out.write({'document': document, 'summary': summary})
Example #13
    def test_bz2_file(self):
        # Write the data to a file
        temp_file = tempfile.NamedTemporaryFile(suffix='.jsonl.bz2')
        with JsonlWriter(temp_file.name) as out:
            for item in self.data:
                out.write(item)

        # Load from file, ensure it is correct
        actual_data = []
        with bz2.open(temp_file.name, 'rb') as f:
            for line in f:
                actual_data.append(json.loads(line.decode()))
        self.assertEqual(self.data, actual_data)
Example #14
def main(args):
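    # Accumulate instances into batches of _BATCH_SIZE, processing each batch
    # in parallel and flushing any final partial batch at the end.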
    python_rouge = PythonRouge()
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            with Parallel(n_jobs=args.num_cores) as parallel:
                batch = []
                for instance in tqdm(f):
                    batch.append(instance)
                    if len(batch) == _BATCH_SIZE:
                        _process_batch(parallel, batch, python_rouge, out)
                        batch.clear()

                if batch:
                    _process_batch(parallel, batch, python_rouge, out)
Example #15
def main(args):
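    # Ensure the output directory exists, then run SumFocus over each
    # document with its topics and context and write the joined result as
    # the cloze.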
    dirname = os.path.dirname(args.output_jsonl)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                document = instance['document']
                topics = instance['topics']
                context = instance['context']
                cloze = run_sumfocus(document, topics, context, args.beta,
                                     args.topic_lambda, args.context_lambda,
                                     args.max_words, args.max_sentences)
                cloze = ' '.join(cloze)
                out.write({'cloze': cloze})
Example #16
def main(args):
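    # Compute greedy oracle sentence labels for each document against its
    # reference summary and attach them to the instance.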
    python_rouge = PythonRouge()

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                summary = instance['summary']
                _, labels = get_greedy_oracle_summary(document, summary, args.metric,
                                                      max_sentences=args.max_sentences,
                                                      max_tokens=args.max_tokens,
                                                      max_bytes=args.max_bytes,
                                                      use_porter_stemmer=args.use_stemmer,
                                                      remove_stopwords=args.remove_stopwords,
                                                      python_rouge=python_rouge)
                instance['labels'] = labels
                out.write(instance)
Example #17
def main(args):
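    # Sample the next sentence from the language model conditioned on the
    # joined context and write it as the cloze.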
    model_dir = args.model_dir
    length = args.length
    temperature = args.temperature
    top_k = args.top_k
    seed = args.seed

    lm = OpenAILanguageModel(model_dir, length, temperature, top_k, seed=seed)
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                context = instance['context']
                context = ' '.join(context)

                first_sentence = lm.sample_next_sentence(context)
                output_data = {'cloze': first_sentence}
                out.write(output_data)
Example #18
def main(args):
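    # Tokenize every text field of each instance (headings, document titles
    # and paragraphs, left/right contexts, and the cloze) with spaCy.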
    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                instance['headings'] = [
                    tokenize(nlp, heading) for heading in instance['headings']
                ]
                for document in instance['documents']:
                    if document['title']:
                        document['title'] = tokenize(nlp, document['title'])
                    document['paragraphs'] = tokenize(nlp,
                                                      document['paragraphs'])

                instance['left_context'] = tokenize(nlp,
                                                    instance['left_context'])
                instance['cloze'] = tokenize(nlp, instance['cloze'])
                instance['right_context'] = tokenize(nlp,
                                                     instance['right_context'])
                out.write(instance)
Example #19
def main(args):
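    # Count, for each token, how many sentences it appears in and track the
    # average sentence length; write the corpus statistics first, then one
    # record per token.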
    dfs = Counter()
    total_document_length = 0
    num_documents = 0

    with JsonlReader(args.input_jsonl) as f:
        for instance in tqdm(f, desc='Calculating document frequencies'):
            document = instance['document']
            for sentence in document:
                tokens = sentence.lower().split()
                total_document_length += len(tokens)
                num_documents += 1
                for token in set(tokens):
                    dfs[token] += 1

    average_document_length = total_document_length / num_documents
    with JsonlWriter(args.output_jsonl) as out:
        out.write({
            'num_documents': num_documents,
            'average_document_length': average_document_length
        })
        for token, df in dfs.items():
            out.write({'token': token, 'df': df})
Example #20
def main(args):
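    # Greedily select oracle sentences for the cloze; write either the joined
    # oracle text (when cloze_only is set) or the instance with its sentence
    # labels.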
    python_rouge = PythonRouge()

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                cloze = instance['cloze']
                oracle, labels = get_greedy_oracle_summary(
                    document, [cloze],
                    args.metric,
                    max_sentences=args.max_sentences,
                    max_tokens=args.max_tokens,
                    max_bytes=args.max_bytes,
                    use_porter_stemmer=args.use_stemmer,
                    remove_stopwords=args.remove_stopwords,
                    python_rouge=python_rouge)
                if args.cloze_only:
                    oracle = ' '.join(oracle)
                    out.write({'cloze': oracle})
                else:
                    instance['labels'] = labels
                    out.write(instance)
Example #21
def save_data(data: List[Dict[str, List[str]]], file_path: str) -> None:
    with JsonlWriter(file_path) as out:
        for item in tqdm(data, desc=f'Writing instances to {file_path}'):
            out.write(item)