Ejemplo n.º 1
0
def build_edmundson(parser, language):
    """Create an ``EdmundsonSummarizer`` configured for *language*.

    The stemmer and the null words are derived from ``language``; the bonus
    and stigma words are read from the ``significant_words`` and
    ``stigma_words`` attributes of ``parser``.
    """
    stemmer = Stemmer(language)
    edmundson = EdmundsonSummarizer(stemmer)

    edmundson.null_words = get_stop_words(language)
    edmundson.bonus_words = parser.significant_words
    edmundson.stigma_words = parser.stigma_words
    return edmundson
def test_article_example():
    """Summarize a Czech article with LSA and expect exactly 20 sentences.

    Source: http://www.prevko.cz/dite/skutecne-pribehy-deti
    """
    article_text = load_resource("articles/prevko_cz_1.txt")
    article = PlaintextParser.from_string(article_text, Tokenizer("czech"))

    lsa = LsaSummarizer(Stemmer("czech"))
    lsa.stop_words = get_stop_words("czech")

    summary = lsa(article.document, 20)
    assert len(summary) == 20
def test_stemmer():
    """Content words are stemmed only when the summarizer is given a stemmer."""
    with_stemmer = _build_summarizer(EMPTY_STOP_WORDS, Stemmer('english'))
    without_stemmer = _build_summarizer(EMPTY_STOP_WORDS)
    sentence = Sentence('testing', Tokenizer('english'))

    stemmed_words = with_stemmer._get_content_words_in_sentence(sentence)
    assert stemmed_words == ['test']

    raw_words = without_stemmer._get_content_words_in_sentence(sentence)
    assert raw_words == ['testing']
def test_issue_5_svd_converges():
    """Regression scenario for an SVD convergence report (currently skipped).

    Source: https://github.com/miso-belica/sumy/issues/5
    """
    # The skip raises immediately, so nothing below runs; the scenario is
    # kept so it can be re-enabled if the issue becomes reproducible.
    pytest.skip("Can't reproduce the issue.")

    article_text = load_resource("articles/svd_converges.txt")
    article = PlaintextParser.from_string(article_text, Tokenizer("english"))

    lsa = LsaSummarizer(Stemmer("english"))
    lsa.stop_words = get_stop_words("english")

    summary = lsa(article.document, 20)
    assert len(summary) == 20
Ejemplo n.º 5
0
def handle_arguments(args, default_input_stream=sys.stdin):
    """Turn parsed command-line options into the objects needed to summarize.

    The document is taken from the first available source, in this order:
    ``--url``, ``--file``, ``--text``, and finally ``default_input_stream``.
    When ``--format`` is not given, URLs default to the ``"html"`` parser and
    every other source to the ``"plaintext"`` parser.

    :param args: Mapping of option names (``--format``, ``--url``, ``--file``,
        ``--text``, ``--length``, ``--language``, ``--stopwords`` and the
        method flags listed in ``AVAILABLE_METHODS``) to their values.
    :param default_input_stream: Stream that is read when no other document
        source is given.
    :returns: Tuple ``(summarizer, parser, items_count)``.
    :raises ValueError: If ``--format`` is given but is not a key of ``PARSERS``.
    """
    requested_format = args['--format']
    if requested_format is not None and requested_format not in PARSERS:
        supported_formats = ", ".join(PARSERS.keys())
        raise ValueError("Unsupported format of input document. Possible values are: %s. Given: %s." % (
            supported_formats,
            requested_format,
        ))

    if args["--url"] is not None:
        parser_class = PARSERS[requested_format or "html"]
        raw_content = fetch_url(args["--url"])
    else:
        # Every non-URL source falls back to the plaintext parser.
        parser_class = PARSERS[requested_format or "plaintext"]
        if args["--file"] is not None:
            with open(args["--file"], "rb") as source_file:
                raw_content = source_file.read()
        elif args["--text"] is not None:
            raw_content = args["--text"]
        else:
            raw_content = default_input_stream.read()

    items_count = ItemsCount(args["--length"])
    language = args["--language"]

    # A user-supplied stop-word file takes precedence over the language default.
    stop_words = (
        read_stop_words(args["--stopwords"])
        if args["--stopwords"]
        else get_stop_words(language)
    )

    parser = parser_class(raw_content, Tokenizer(language))
    stemmer = Stemmer(language)

    # Pick the first summarization method whose flag is set in ``args``.
    chosen_class = next(
        method_class
        for flag, method_class in AVAILABLE_METHODS.items()
        if args[flag]
    )
    summarizer = build_summarizer(chosen_class, stop_words, stemmer, parser)

    return summarizer, parser, items_count
Ejemplo n.º 6
0
def test_empty_document():
    """Summarizing a document built without content returns no sentences."""
    empty_document = build_document()
    text_rank = TextRankSummarizer(Stemmer("english"))

    summary = text_rank(empty_document, 10)

    assert len(summary) == 0
Ejemplo n.º 7
0
def test_slovak_stemmer():
    """The Slovak stemmer has the same type and state as the Czech one."""
    czech = Stemmer("czech")
    slovak = Stemmer("slovak")

    assert type(slovak) is type(czech)
    assert czech.__dict__ == slovak.__dict__
Ejemplo n.º 8
0
def test_french_stemmer():
    """The French stemmer reduces "jolies" to the stem "jol"."""
    # Build the stemmer for the language this test is named after, so the
    # French stemming rules are the ones actually exercised.
    french_stemmer = Stemmer('french')
    assert "jol" == french_stemmer("jolies")
Ejemplo n.º 9
0
def test_czech_stemmer():
    """The Czech stemmer reduces "pěkný" to the stem "pěkn"."""
    stem = Stemmer('czech')
    assert stem("pěkný") == "pěkn"
Ejemplo n.º 10
0
def test_german_stemmer():
    """The German stemmer reduces "sterben" to the stem "sterb"."""
    stem = Stemmer('german')
    assert stem("sterben") == "sterb"
Ejemplo n.º 11
0
def test_english_stemmer():
    """The English stemmer reduces "beautiful" to the stem "beauti"."""
    stem = Stemmer('english')
    assert stem("beautiful") == "beauti"
Ejemplo n.º 12
0
def test_missing_stemmer_language():
    """Requesting a stemmer for an unsupported language raises ``LookupError``."""
    unsupported_language = "klingon"

    with pytest.raises(LookupError):
        Stemmer(unsupported_language)
Ejemplo n.º 13
0
def build_sum_basic(parser, language):
    """Create a ``SumBasicSummarizer`` with the stemmer and stop words of *language*.

    ``parser`` is accepted but not used by this builder.
    """
    stemmer = Stemmer(language)
    sum_basic = SumBasicSummarizer(stemmer)
    sum_basic.stop_words = get_stop_words(language)
    return sum_basic
Ejemplo n.º 14
0
def build_lex_rank(parser, language):
    """Create a ``LexRankSummarizer`` with the stemmer and stop words of *language*.

    ``parser`` is accepted but not used by this builder.
    """
    stemmer = Stemmer(language)
    lex_rank = LexRankSummarizer(stemmer)
    lex_rank.stop_words = get_stop_words(language)
    return lex_rank
Ejemplo n.º 15
0
def build_lsa(parser, language):
    """Create an ``LsaSummarizer`` with the stemmer and stop words of *language*.

    ``parser`` is accepted but not used by this builder.
    """
    stemmer = Stemmer(language)
    lsa = LsaSummarizer(stemmer)
    lsa.stop_words = get_stop_words(language)
    return lsa