def test_get_stemmer_positive():
    """Test for the method get_stemmer()."""
    stemmer = Stemmer.get_stemmer("LancasterStemmer")
    assert isinstance(stemmer, nltk.stem.LancasterStemmer)

    stemmer = Stemmer.get_stemmer("PorterStemmer")
    assert isinstance(stemmer, nltk.stem.PorterStemmer)

    stemmer = Stemmer.get_stemmer("EnglishStemmer")
    assert isinstance(stemmer, nltk.stem.snowball.EnglishStemmer)
def test_get_stemmer_negative():
    """Test for the method get_stemmer()."""
    with pytest.raises(StemmerNotFoundError):
        stemmer = Stemmer.get_stemmer("unknown")
        print(stemmer)

    with pytest.raises(StemmerNotFoundError):
        stemmer = Stemmer.get_stemmer("")
        print(stemmer)

    with pytest.raises(StemmerNotFoundError):
        stemmer = Stemmer.get_stemmer(None)
        print(stemmer)
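# The two tests above pin down the contract of Stemmer.get_stemmer(): the three
# known NLTK stemmer names yield instances of the matching classes, and any
# other value (unknown name, empty string, None) raises StemmerNotFoundError.
# Below is a minimal sketch of such a dispatcher, backed by a plain
# name-to-class registry -- a hypothetical re-implementation consistent with
# the tests, not necessarily the project's actual code.

import nltk.stem
import nltk.stem.snowball


class StemmerNotFoundError(Exception):
    """Raised when the requested stemmer is not registered."""


class Stemmer(object):
    """Factory for NLTK stemmers, keyed by class name."""

    _KNOWN_STEMMERS = {
        'LancasterStemmer': nltk.stem.LancasterStemmer,
        'PorterStemmer': nltk.stem.PorterStemmer,
        'EnglishStemmer': nltk.stem.snowball.EnglishStemmer,
    }

    @classmethod
    def get_stemmer(cls, name):
        """Instantiate the stemmer registered under the given name."""
        try:
            return cls._KNOWN_STEMMERS[name]()
        except KeyError:
            raise StemmerNotFoundError("Stemmer %r is not registered" % (name,))

    @classmethod
    def get_registered_stemmers(cls):
        """Return the names of all registered stemmers."""
        return list(cls._KNOWN_STEMMERS.keys())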
def _prepare_lookup(keywords_file=None, stopwords_file=None, ngram_size=None,
                    lemmatize=False, stemmer=None):  # pylint: disable=too-many-arguments
    """Prepare resources for keywords lookup.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ngram_size: size of ngrams; if None, the ngram size is computed
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    """
    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)

    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning("Computed ngram size (%d) does not match the supplied ngram size (%d); "
                        "some synonyms will be omitted", computed_ngram_size, ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file, ngram_size,
                          lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)
    return ngram_size, tokenizer, chief, CoreParser()
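# A hedged usage sketch for _prepare_lookup(); the file names below are
# hypothetical. If the keywords file contains multi-word synonyms, the
# returned ngram_size reflects the longest of them unless a smaller size is
# forced, in which case the function logs a warning and the longer synonyms
# are omitted during tokenization.
def _example_prepare_lookup():
    """Show what _prepare_lookup() hands back to its callers."""
    ngram_size, tokenizer, chief, parser = _prepare_lookup(
        keywords_file='keywords.yaml',
        stopwords_file='stopwords.txt',
        stemmer='EnglishStemmer')
    print("effective ngram size:", ngram_size)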
def reckon(keywords_file=None, stopwords_file=None, stemmer=None, lemmatize=False):
    """Compute keywords and stopwords based on stemmer and lemmatizer configuration.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param stemmer: stemmer to be used
    :param lemmatize: True if lemmatization should be done
    :return: computed keywords and stopwords; duplicate entries are not removed
    """
    result = dict.fromkeys(('keywords', 'stopwords'))

    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)
    tokenizer = Tokenizer(stopwords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)

    result['keywords'] = chief.keywords
    result['stopwords'] = sorted(tokenizer.raw_stopwords) + sorted(tokenizer.regexp_stopwords)

    return result
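# A hedged usage sketch for reckon(); the file names are hypothetical. The
# returned dict carries the keywords known to KeywordsChief plus the raw and
# regexp stopwords, each group sorted before concatenation.
def _example_reckon():
    """Print keywords and stopwords as reckon() reports them."""
    result = reckon(keywords_file='keywords.yaml',
                    stopwords_file='stopwords.txt',
                    stemmer='PorterStemmer',
                    lemmatize=False)
    print(result['keywords'])
    print(result['stopwords'])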
def test_get_registered_stemmers():
    """Test for the class method get_registered_stemmers()."""
    stemmers = Stemmer.get_registered_stemmers()
    assert stemmers
    # we expect at least three stemmers to be registered
    assert len(stemmers) >= 3
def get_registered_stemmers():
    """Get all stemmers that are available in NLTK."""
    return Stemmer.get_registered_stemmers()
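# A hedged usage sketch for the module-level wrapper above. The test before it
# only guarantees that at least three stemmers are registered, so iterate over
# whatever the registry reports (assumed here to be printable names or stemmer
# objects) rather than hard-coding specific entries.
def _example_list_stemmers():
    """Print every registered stemmer."""
    for stemmer in get_registered_stemmers():
        print(stemmer)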
def lookup(path, keywords_file=None, stopwords_file=None, ignore_errors=False,
           ngram_size=None, use_progressbar=False, lemmatize=False,
           stemmer=None):  # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup.

    :param path: path of the directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True if errors should be reported but computation shouldn't be stopped
    :param ngram_size: size of ngrams; if None, the ngram size is computed
    :param use_progressbar: True if a progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :return: found keywords, reported per file
    """
    ret = {}

    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)

    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning("Computed ngram size (%d) does not match the supplied ngram size (%d); "
                        "some synonyms will be omitted", computed_ngram_size, ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file, ngram_size,
                          lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)

    for file in progressbarize(iter_files(path, ignore_errors), progress=use_progressbar):
        _logger.info("Processing file '%s'", file)
        try:
            content = CoreParser().parse_file(file)
            tokens = tokenizer.tokenize(content)
            # No per-sentence analysis is done for now, so flatten the
            # sentences into a single stream of tokens.
            tokens = chain(*tokens)
            keywords = chief.extract_keywords(tokens)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s", file, str(exc))
            continue

        ret[file] = keywords

    return ret
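# A hedged usage sketch for lookup(); the path and file names are
# hypothetical. With ignore_errors=True, unparsable files are logged and
# skipped instead of aborting the whole run, so the result maps only the
# successfully processed files to their keywords.
def _example_lookup():
    """Report keywords found per file under a directory tree."""
    found = lookup('project/src',
                   keywords_file='keywords.yaml',
                   stopwords_file='stopwords.txt',
                   lemmatize=True,
                   ignore_errors=True,
                   use_progressbar=True)
    for file_name, keywords in found.items():
        print(file_name, keywords)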