Example #1
def _make_sequence(
        processor: Processor,
        corpus: ProviderBase,
        set_type: SetType = SetType.DEV) -> Iterator[DocumentForEval]:
    """Yield evaluation-ready documents for the given subset of the corpus."""
    for d in tqdm(corpus.subset(set_type), total=corpus.subset_size(set_type)):
        # Size the summary from the reference summary when the corpus provides
        # one; otherwise fall back to a relative size of 10%.
        summary_size = SummarySize.new_absolute(len(d.ref_summary)) \
            if corpus.purpose() & CorpusPurpose.SUMMARY \
            else SummarySize.new_relative(0.1)
        # Request as many keywords as the reference annotation contains (0 if none).
        kw_num = len(
            retrieve_lemmatized_tokens(corpus.language(), d.ref_keywords)
        ) if corpus.purpose() & CorpusPurpose.KEYWORDS else 0
        text_process_params = TextProcessParams(summary_size, kw_num)
        summary = processor.process_text(d.text, text_process_params)
        if summary.errors:
            print(f"Errors found while processing document '{d.id_}'; skipped.")
            print("First:", summary.errors[0])
            continue
        if summary.warnings:
            print(f"Warnings found while processing document '{d.id_}'."
                  f" First: {summary.warnings[0]}")
        yield DocumentForEval(d.ref_keywords,
                              [kw.lemma for kw in summary.keywords],
                              d.ref_summary,
                              [s.lemma for s in summary.summary],
                              corpus.language())

def test_TextProcessParams_str():
    params = TextProcessParams(summary_size=SummarySize.new_relative(0.1),
                               keywords_number=0)
    s = str(params)
    assert s != ''
    params = TextProcessParams(summary_size=SummarySize.new_absolute(10),
                               keywords_number=0)
    s = str(params)
    assert s != ''
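As a rough usage sketch (not from the source repository; the concrete corpus provider and the list() materialization are assumptions), the generator in this example could be driven like this:

# Hypothetical driver, assuming a ProviderBase implementation such as
# Krapivin2009Provider (used in Example #5) is available.
processor = Processor()
corpus = Krapivin2009Provider()
eval_docs = list(_make_sequence(processor, corpus, SetType.DEV))
print(f"Prepared {len(eval_docs)} documents for evaluation")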
Example #3
def test_digest_processor_fr():
    processor = Processor()
    # Raw French text of the Universal Declaration of Human Rights (NLTK corpus).
    text = udhr.raw('French_Francais-Latin1')
    text_process_params = TextProcessParams(SummarySize.new_relative(0.1),
                                            keywords_number=10)
    document = processor.process_text(text, text_process_params)
    assert isinstance(document, Document)
    assert len(document.sentences) >= 5
Example #4
def test_digest_processor_de():
    processor = Processor()
    # Raw German text of the Universal Declaration of Human Rights (NLTK corpus).
    # text = open(path.join(__location__, 'de_text.txt'), 'r', encoding='utf8').read()
    text = udhr.raw('German_Deutsch-Latin1')
    text_process_params = TextProcessParams(SummarySize.new_absolute(3),
                                            keywords_number=10)
    document = processor.process_text(text, text_process_params)
    assert isinstance(document, Document)
    assert len(document.sentences) >= 5
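Both of these tests read their raw input from NLTK's UDHR corpus, so that corpus must be available locally. A one-time setup sketch using the standard NLTK downloader (the fileids match the ones used above):

import nltk
nltk.download('udhr')          # one-time fetch of the UDHR corpus
from nltk.corpus import udhr
print(udhr.raw('German_Deutsch-Latin1')[:80])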
Example #5
def main():
    processor_summa = Processor([SummaCore()])
    processor_em = Processor([EmCoresWrapper()])
    corpus = Krapivin2009Provider()
    sample = corpus.document_by_id(corpus.ids_train[74])

    # Match the requested summary length and keyword count to the reference
    # annotations of the sample document.
    process_params = TextProcessParams(
        SummarySize.new_absolute(len(sample.ref_summary)),
        len(sample.ref_keywords))
    print(process_params)
    print_reference(sample)

    process_and_report(sample.text, process_params, processor_summa,
                       'SummaCore')
    process_and_report(sample.text, process_params, processor_em,
                       'EmCoresWrapper')

def test_TextProcessParams_creation():
    # Construction with valid arguments must not raise.
    params = TextProcessParams(summary_size=SummarySize.new_relative(0.1),
                               keywords_number=0)
    params = TextProcessParams(summary_size=SummarySize.new_relative(0.1),
                               keywords_number=10)

def test_TextProcessParams_keywords_negative():
    with pytest.raises(ValueError):
        TextProcessParams(summary_size=SummarySize.new_relative(0.1),
                          keywords_number=-1)
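Taken together, the examples show the two ways a summary size can be requested: relative to the input length or as an absolute sentence count. A minimal sketch assuming only the TextProcessParams and SummarySize API exercised above (nothing here is taken from the source repository beyond those names):

# Relative size: roughly 20% of the input sentences.
params_relative = TextProcessParams(summary_size=SummarySize.new_relative(0.2),
                                    keywords_number=5)
# Absolute size: exactly three sentences.
params_absolute = TextProcessParams(summary_size=SummarySize.new_absolute(3),
                                    keywords_number=5)
print(params_relative)   # str() is non-empty, as test_TextProcessParams_str checks
print(params_absolute)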