def test_remove_stopwords_method():
    """Check the remove_stopwords method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)

    stopwords = tokenizer.raw_stopwords
    assert stopwords

    expected = {"i", "me", "our", "he", "she"}
    assert expected <= set(stopwords)

    # remove some stopwords and check again
    stopwords = tokenizer.remove_stopwords(
        ["foo", "something", "me", "our", "bar"])
    expected = {"foo", "bar"}
    assert expected == set(stopwords)

    # plain numeric tokens are removed as well
    stopwords = tokenizer.remove_stopwords(["foo", "0", "123", "6502", "bar"])
    expected = {"foo", "bar"}
    assert expected == set(stopwords)

    # tokens with a leading minus sign are kept
    stopwords = tokenizer.remove_stopwords(
        ["foo", "-0", "-123", "-6502", "bar"])
    expected = {"foo", "bar", "-0", "-123", "-6502"}
    assert expected == set(stopwords)

def test_tokenize_error_handling(_mock):
    """Check error handling in the tokenize method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", 2)
    content = "The prerequisite for tagging is to collect keywords that are used " + \
              "out there by developers.This also means that tagger uses keywords " + \
              "that are considered as interesting ones by developers."
    with pytest.raises(InstallPrepareError):
        tokenizer.tokenize(content)

def test_tokenize(_mock1, _mock2):
    """Check the tokenize method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", 2)
    content = "The prerequisite for tagging is to collect keywords that are used " + \
              "out there by developers.This also means that tagger uses keywords " + \
              "that are considered as interesting ones by developers."
    results = tokenizer.tokenize(content)
    assert results

def _prepare_lookup(keywords_file=None, stopwords_file=None, ngram_size=None,
                    lemmatize=False, stemmer=None):  # pylint: disable=too-many-arguments
    """Prepare resources for keywords lookup.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ngram_size: size of ngrams, if None, ngram size is computed
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    """
    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning("Computed ngram size (%d) does not reflect supplied ngram size (%d), "
                        "some synonyms will be omitted", computed_ngram_size, ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file, ngram_size,
                          lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)

    return ngram_size, tokenizer, chief, CoreParser()

def reckon(keywords_file=None, stopwords_file=None, stemmer=None, lemmatize=False):
    """Compute keywords and stopwords based on stemmer and lemmatizer configuration.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param stemmer: stemmer to be used
    :param lemmatize: True if lemmatization should be done
    :return: computed keywords and stopwords, duplicate entries are not removed
    """
    result = dict.fromkeys(('keywords', 'stopwords'))

    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)
    tokenizer = Tokenizer(stopwords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)

    result['keywords'] = chief.keywords
    result['stopwords'] = sorted(tokenizer.raw_stopwords) + sorted(tokenizer.regexp_stopwords)

    return result

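# Usage sketch (not part of the original module): how reckon() might be called to
# inspect the computed keyword and stopword lists. The stopwords path below reuses
# the test fixture referenced elsewhere in this repo and is illustrative only; the
# keywords file and stemmer are left at their defaults.
#
#     summary = reckon(stopwords_file="test_data/stopwords.txt", lemmatize=True)
#     print(summary['keywords'])
#     print(summary['stopwords'])
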
def test_raw_stopwords_property():
    """Check the raw_stopwords property."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)
    stopwords = tokenizer.raw_stopwords
    assert stopwords

    expected = {"i", "me", "our", "he", "she"}
    # subset operation
    assert expected <= set(stopwords)

def test_stopwords_reading():
    """Check the ability to read stopwords."""
    with open("test_data/stopwords.txt", "r") as fin:
        content = fin.read()
        bytestream = io.BytesIO(content.encode())
        fin = io.TextIOWrapper(bytestream)
        tokenizer = Tokenizer(fin, None)
        assert tokenizer

    with open("test_data/stopwords_bad_re.txt", "r") as fin:
        content = fin.read()
        bytestream = io.BytesIO(content.encode())
        fin = io.TextIOWrapper(bytestream)
        tokenizer = Tokenizer(fin, None)
        assert tokenizer

    with pytest.raises(InvalidInputError):
        tokenizer = Tokenizer({}, None)

def lookup(path, keywords_file=None, stopwords_file=None, ignore_errors=False, ngram_size=None,
           use_progressbar=False, lemmatize=False,
           stemmer=None):  # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup.

    :param path: path of directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True, if errors should be reported but computation shouldn't be stopped
    :param ngram_size: size of ngrams, if None, ngram size is computed
    :param use_progressbar: True if progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :return: found keywords, reported per file
    """
    ret = {}

    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning("Computed ngram size (%d) does not reflect supplied ngram size (%d), "
                        "some synonyms will be omitted", computed_ngram_size, ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file, ngram_size,
                          lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)

    for file in progressbarize(iter_files(path, ignore_errors), progress=use_progressbar):
        _logger.info("Processing file '%s'", file)
        try:
            content = CoreParser().parse_file(file)
            tokens = tokenizer.tokenize(content)
            # We do not perform any analysis on sentences now, so treat all tokens
            # as one array (sentences of tokens).
            tokens = chain(*tokens)
            keywords = chief.extract_keywords(tokens)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s", file, str(exc))
            continue

        ret[file] = keywords

    return ret

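# Usage sketch (not part of the original module): how lookup() might be invoked on a
# directory tree. The path below is a hypothetical placeholder; the stopwords path
# reuses the test fixture referenced elsewhere in this repo.
#
#     found = lookup("path/to/project",
#                    stopwords_file="test_data/stopwords.txt",
#                    ignore_errors=True,
#                    use_progressbar=True)
#     for file_name, keywords in found.items():
#         print(file_name, keywords)
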
def test_regexp_stopwords_property():
    """Check the regexp_stopwords property."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)
    stopwords = tokenizer.regexp_stopwords
    assert stopwords
    assert "re: [0-9.]+" in stopwords

def test_stem_method():
    """Check the _stem method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)

    # test with no stemmer
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with custom stemmer
    tokenizer = Tokenizer("test_data/stopwords.txt", stemmer=CustomStemmer())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with custom stemmer
    tokenizer = Tokenizer("test_data/stopwords.txt", stemmer=CustomStemmer2())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["*foo", "*bar", "*me", "*your", "*6502"]

    # test with custom stemmer
    tokenizer = Tokenizer("test_data/stopwords.txt", stemmer=CustomStemmer3())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["***", "***", "***", "***", "***"]

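# Note: the CustomStemmer* doubles used above are defined elsewhere in the test
# module. Judging by the assertions, they behave roughly like the sketch below;
# the stem() method name is an assumption about the interface Tokenizer._stem()
# expects.
#
#     class CustomStemmer:        # identity stemmer, tokens pass through unchanged
#         def stem(self, token):
#             return token
#
#     class CustomStemmer2:       # prefixes every token with '*'
#         def stem(self, token):
#             return "*" + token
#
#     class CustomStemmer3:       # collapses every token to '***'
#         def stem(self, token):
#             return "***"
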
def test_lemmatize_method():
    """Check the _lemmatize method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)

    # test with no lemmatizer
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with custom lemmatizer
    tokenizer = Tokenizer("test_data/stopwords.txt", lemmatizer=CustomLemmatizer())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with custom lemmatizer
    tokenizer = Tokenizer("test_data/stopwords.txt", lemmatizer=CustomLemmatizer2())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["*foo", "*bar", "*me", "*your", "*6502"]

    # test with custom lemmatizer
    tokenizer = Tokenizer("test_data/stopwords.txt", lemmatizer=CustomLemmatizer3())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["***", "***", "***", "***", "***"]

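# Note: similarly, the CustomLemmatizer* doubles are defined elsewhere in the test
# module. Based on the assertions, a sketch could look like this; the lemmatize()
# method name is an assumption about the interface Tokenizer._lemmatize() expects.
#
#     class CustomLemmatizer:     # identity lemmatizer, tokens pass through unchanged
#         def lemmatize(self, token):
#             return token
#
#     class CustomLemmatizer2:    # prefixes every token with '*'
#         def lemmatize(self, token):
#             return "*" + token
#
#     class CustomLemmatizer3:    # collapses every token to '***'
#         def lemmatize(self, token):
#             return "***"
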
def test_initial_state():
    """Check the initial state of Tokenizer."""
    tokenizer = Tokenizer(None, None)
    assert tokenizer