Code example #1
def test_get_stemmer_positive():
    """Test for the method get_stemmer()."""
    stemmer = Stemmer.get_stemmer("LancasterStemmer")
    assert isinstance(stemmer, nltk.stem.LancasterStemmer)

    stemmer = Stemmer.get_stemmer("PorterStemmer")
    assert isinstance(stemmer, nltk.stem.PorterStemmer)

    stemmer = Stemmer.get_stemmer("EnglishStemmer")
    assert isinstance(stemmer, nltk.stem.snowball.EnglishStemmer)
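For reference, a minimal usage sketch, assuming Stemmer.get_stemmer() returns a ready-to-use NLTK stemmer instance as the test above asserts; the sample word is arbitrary.

# Resolve a stemmer by name and apply NLTK's standard stem() method to one word.
stemmer = Stemmer.get_stemmer("PorterStemmer")
print(stemmer.stem("running"))  # -> "run"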
Code example #2
def test_get_stemmer_negative():
    """Test for the method get_stemmer()."""
    with pytest.raises(StemmerNotFoundError):
        stemmer = Stemmer.get_stemmer("unknown")
        print(stemmer)

    with pytest.raises(StemmerNotFoundError):
        stemmer = Stemmer.get_stemmer("")
        print(stemmer)

    with pytest.raises(StemmerNotFoundError):
        stemmer = Stemmer.get_stemmer(None)
        print(stemmer)
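A hedged sketch of defensive use of the same API, assuming only that StemmerNotFoundError is raised for unknown names as the test above shows; falling back to None (no stemming) is an illustrative choice, not the project's documented behaviour.

# Fall back to no stemming when the requested stemmer name is not registered.
try:
    stemmer = Stemmer.get_stemmer("NotAStemmer")
except StemmerNotFoundError:
    stemmer = None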
Code example #3
def _prepare_lookup(keywords_file=None, stopwords_file=None, ngram_size=None, lemmatize=False,
                    stemmer=None):
    # pylint: disable=too-many-arguments
    """Prepare resources for keywords lookup.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ngram_size: size of ngrams; if None, the ngram size is computed
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    """
    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning("Computed ngram size (%d) does not reflect supplied ngram size (%d), "
                        "some synonyms will be omitted", chief.compute_ngram_size(), ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file, ngram_size, lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    return ngram_size, tokenizer, chief, CoreParser()
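For orientation, a hypothetical call of the helper above, illustrating only the tuple it returns; the file names are placeholders, not files shipped with the project.

# _prepare_lookup() bundles everything the lookup needs: the effective ngram
# size, a Tokenizer, a KeywordsChief and a CoreParser instance.
ngram_size, tokenizer, chief, parser = _prepare_lookup(
    keywords_file="keywords.yaml",   # placeholder path
    stopwords_file="stopwords.txt",  # placeholder path
    lemmatize=True,
)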
Code example #4
def reckon(keywords_file=None,
           stopwords_file=None,
           stemmer=None,
           lemmatize=False):
    """Compute keywords and stopwords based on stemmer and lemmatizer configuration.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param stemmer: stemmer to be used
    :param lemmatize: True if lemmatization should be done
    :return: computed keywords and stopwords; duplicate entries are not removed
    """
    result = dict.fromkeys(('keywords', 'stopwords'))

    stemmer_instance = Stemmer.get_stemmer(
        stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)
    tokenizer = Tokenizer(stopwords_file,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    result['keywords'] = chief.keywords
    result['stopwords'] = sorted(tokenizer.raw_stopwords) + sorted(
        tokenizer.regexp_stopwords)

    return result
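A minimal sketch of calling reckon() with placeholder file names; per the docstring, the result maps 'keywords' and 'stopwords' to the computed values.

# Compute keywords and stopwords for a given stemmer/lemmatizer configuration.
result = reckon(
    keywords_file="keywords.yaml",   # placeholder path
    stopwords_file="stopwords.txt",  # placeholder path
    stemmer="EnglishStemmer",        # resolved via Stemmer.get_stemmer()
    lemmatize=False,
)
print(result['keywords'])
print(result['stopwords'])   # raw stopwords followed by regexp stopwords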
Code example #5
def test_get_registered_stemmers():
    """Test for the class method get_registered_stemmers()."""
    stemmers = Stemmer.get_registered_stemmers()
    assert stemmers
    # we expect at least three stemmers to be registered
    assert len(stemmers) >= 3
Code example #6
def get_registered_stemmers():
    """Get all stemmers that are available in NLTK."""
    return Stemmer.get_registered_stemmers()
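A one-line usage sketch; the test in the previous example only guarantees that the returned collection has at least three entries, so we merely inspect it.

# List whatever stemmers the project has registered (at least three per the test).
print(get_registered_stemmers())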
Code example #7
def lookup(path,
           keywords_file=None,
           stopwords_file=None,
           ignore_errors=False,
           ngram_size=None,
           use_progressbar=False,
           lemmatize=False,
           stemmer=None):
    # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup.

    :param path: path of directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True if errors should be reported but the computation shouldn't be stopped
    :param ngram_size: size of ngrams; if None, the ngram size is computed
    :param use_progressbar: True if progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :return: found keywords, reported per file
    """
    ret = {}

    stemmer_instance = Stemmer.get_stemmer(
        stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning(
            "Computed ngram size (%d) does not reflect supplied ngram size (%d), "
            "some synonyms will be omitted", computed_ngram_size, ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file,
                          ngram_size,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    for file in progressbarize(iter_files(path, ignore_errors),
                               progress=use_progressbar):
        _logger.info("Processing file '%s'", file)
        try:
            content = CoreParser().parse_file(file)
            tokens = tokenizer.tokenize(content)
            # We do not perform any analysis on sentences now, so treat all
            # tokens as one array (sentences of tokens).
            tokens = chain(*tokens)
            keywords = chief.extract_keywords(tokens)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s", file,
                              str(exc))
            continue

        ret[file] = keywords

    return ret
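Finally, a hypothetical invocation of lookup(); the scanned path and the keyword/stopword file names are illustrative, and the returned dictionary maps each processed file to the keywords found in it, as the docstring states.

# Scan a directory tree and report keywords per file.
results = lookup(
    "project/src",                   # placeholder directory to scan
    keywords_file="keywords.yaml",   # placeholder path
    stopwords_file="stopwords.txt",  # placeholder path
    stemmer="PorterStemmer",         # a registered NLTK stemmer name
    lemmatize=True,
    ignore_errors=True,              # log parse failures instead of raising
    use_progressbar=False,
)
for file_name, keywords in results.items():
    print(file_name, keywords)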