def test_digest_processor_fr():
    """Summarize the French UDHR text and check the basic output contract.

    Uses a relative summary size (10% of the text) and asserts that the
    processor returns a Document with at least 5 sentences.
    """
    proc = Processor()
    raw_text = udhr.raw('French_Francais-Latin1')
    params = TextProcessParams(SummarySize.new_relative(0.1), keywords_number=10)
    result = proc.process_text(raw_text, params)
    assert isinstance(result, Document)
    assert len(result.sentences) >= 5
def test_digest_processor_de():
    """Summarize the German UDHR text and check the basic output contract.

    Uses an absolute summary size (3 sentences) and asserts that the
    processor returns a Document with at least 5 sentences.
    """
    proc = Processor()
    raw_text = udhr.raw('German_Deutsch-Latin1')
    params = TextProcessParams(SummarySize.new_absolute(3), keywords_number=10)
    result = proc.process_text(raw_text, params)
    assert isinstance(result, Document)
    assert len(result.sentences) >= 5
def main():
    """Run two summarization cores on one Krapivin2009 training document.

    Prints the processing parameters, the reference summary/keywords, and
    one report per core for side-by-side comparison.
    """
    summa_proc = Processor([SummaCore()])
    em_proc = Processor([EmCoresWrapper()])

    corpus = Krapivin2009Provider()
    sample = corpus.document_by_id(corpus.ids_train[74])

    # Ask for exactly as many summary sentences / keywords as the reference has.
    params = TextProcessParams(
        SummarySize.new_absolute(len(sample.ref_summary)),
        len(sample.ref_keywords),
    )
    print(params)
    print_reference(sample)

    process_and_report(sample.text, params, summa_proc, 'SummaCore')
    process_and_report(sample.text, params, em_proc, 'EmCoresWrapper')
def _make_sequence(
        processor: Processor,
        corpus: ProviderBase,
        set_type: SetType = SetType.DEV) -> Sequence[DocumentForEval]:
    """Yield evaluation-ready documents produced by *processor* over a corpus subset.

    For each document in the chosen subset, the requested summary size and
    keyword count are derived from the corpus purpose flags, the text is
    processed, and a DocumentForEval pairing reference data with system
    output is yielded.  Documents whose processing reports errors are
    skipped; warnings are printed but do not skip the document.

    NOTE(review): despite the declared return type, this is a generator;
    the annotation is effectively Iterator[DocumentForEval] — kept as-is to
    avoid touching file-level imports.
    """
    for d in tqdm(corpus.subset(set_type), total=corpus.subset_size(set_type)):
        # Summary size: match the reference summary length when the corpus
        # provides summaries, otherwise fall back to 10% of the text.
        summary_size = SummarySize.new_absolute(len(d.ref_summary)) \
            if corpus.purpose() & CorpusPurpose.SUMMARY \
            else SummarySize.new_relative(.1)
        # Keyword count: number of lemmatized reference keywords, or 0 when
        # the corpus carries no keyword annotations.
        kw_num = len(
            retrieve_lemmatized_tokens(corpus.language(), d.ref_keywords)
        ) if corpus.purpose() & CorpusPurpose.KEYWORDS else 0
        text_process_params = TextProcessParams(summary_size, kw_num)
        summary = processor.process_text(d.text, text_process_params)
        if summary.errors:
            print(
                f"Found errors during processing document '{d.id_}'. Skipped.")
            print("First:", summary.errors[0])
            # BUG FIX: was `break`, which aborted the whole sequence on the
            # first failing document; `continue` actually skips just that
            # document, as the message promises.
            continue
        if summary.warnings:
            print(
                f"Found in document {d.id_} warnings during processing. First: {summary.warnings[0]}"
            )
        res = DocumentForEval(d.ref_keywords,
                              [kw.lemma for kw in summary.keywords],
                              d.ref_summary,
                              [s.lemma for s in summary.summary],
                              corpus.language())
        yield res
def evaluate_all_corpora():
    """Evaluate every (corpus, processor) pair and print the resulting metrics.

    Runs the EmCore, FallbackCore and SummaCore processors — in that order —
    over the WikiHow, BBC News and Krapivin2009 corpora, printing the metrics
    returned by evaluate_processor_on_corpus for each combination.

    Refactored from nine copy-pasted call/print pairs into a data-driven
    double loop; call order and the "Corpus / Core" labels are unchanged.
    """
    processors = [
        ("EmCore", Processor([EmCoresWrapper()])),
        ("FallbackCore", Processor([FallbackCore()])),
        ("SummaCore", Processor([SummaCore()])),
    ]
    corpora = [
        ("WikiHow", WikiHowProvider()),
        ("BBC News", BbcNewsProvider()),
        # For a quick smoke run, wrap in LimitedProvider(Krapivin2009Provider(), 30).
        ("Krapivin2009", Krapivin2009Provider()),
    ]
    for corpus_name, corpus in corpora:
        for core_name, processor in processors:
            metrics = evaluate_processor_on_corpus(
                processor, corpus, f"{corpus_name} / {core_name}")
            print(metrics)