Example 1
def test_remove_stopwords_method():
    """Check the remove_stopwords method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)

    stopwords = tokenizer.raw_stopwords
    assert stopwords
    expected = {"i", "me", "our", "he", "she"}
    assert expected <= set(stopwords)

    # raw stopwords ("me", "our") are removed, other tokens are kept
    stopwords = tokenizer.remove_stopwords(
        ["foo", "something", "me", "our", "bar"])
    expected = {"foo", "bar"}
    assert expected == set(stopwords)

    # plain numeric tokens match the regexp stopword [0-9.]+ and are removed as well
    stopwords = tokenizer.remove_stopwords(["foo", "0", "123", "6502", "bar"])
    expected = {"foo", "bar"}
    assert expected == set(stopwords)

    # tokens with a leading minus sign do not match the regexp stopword and are kept
    stopwords = tokenizer.remove_stopwords(
        ["foo", "-0", "-123", "-6502", "bar"])
    expected = {"foo", "bar", "-0", "-123", "-6502"}
    assert expected == set(stopwords)
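
The last two checks are explained by the regexp stopword shown later in Example 9: test_data/stopwords.txt contains the pattern [0-9.]+, which matches plain numeric tokens but not tokens carrying a leading minus sign. A minimal, self-contained sketch of that filtering step, assuming the pattern has to match the whole token:

import re

# regexp stopword from test_data/stopwords.txt (see Example 9);
# full-match semantics are an assumption made for this illustration
NUMERIC_STOPWORD = re.compile(r"[0-9.]+")

def drop_regexp_stopwords(tokens):
    """Keep only tokens that do not fully match the numeric stopword pattern."""
    return [token for token in tokens if not NUMERIC_STOPWORD.fullmatch(token)]

print(drop_regexp_stopwords(["foo", "0", "123", "6502", "bar"]))     # ['foo', 'bar']
print(drop_regexp_stopwords(["foo", "-0", "-123", "-6502", "bar"]))  # ['foo', '-0', '-123', '-6502', 'bar']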
Example 2
def test_tokenize_error_handling(_mock):
    """Check the tokenize method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", 2)
    content = "The prerequisite for tagging is to collect keywords that are used " + \
              "out there by developers.This also means that tagger uses keywords " + \
              "that are considered as interesting ones by developers."
    with pytest.raises(InstallPrepareError):
        tokenizer.tokenize(content)
Example 3
def test_tokenize(_mock1, _mock2):
    """Check the tokenize method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", 2)
    content = "The prerequisite for tagging is to collect keywords that are used " + \
              "out there by developers.This also means that tagger uses keywords " + \
              "that are considered as interesting ones by developers."
    results = tokenizer.tokenize(content)
    assert results
Example 4
def _prepare_lookup(keywords_file=None, stopwords_file=None, ngram_size=None, lemmatize=False,
                    stemmer=None):
    # pylint: disable=too-many-arguments
    """Prepare resources for keywords lookup.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ngram_size: size of ngrams, if None, ngram size is computed
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    """
    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance, stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning("Computed ngram size (%d) does not reflect supplied ngram size (%d), "
                        "some synonyms will be omitted", chief.compute_ngram_size(), ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file, ngram_size, lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    return ngram_size, tokenizer, chief, CoreParser()
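
The tuple returned by _prepare_lookup mirrors what lookup (Example 8) builds inline, so a caller would typically consume it roughly as sketched below; the file names are hypothetical and only illustrate the call shape:

from itertools import chain

# hypothetical keyword/stopword files and input file
ngram_size, tokenizer, chief, parser = _prepare_lookup(
    keywords_file="keywords.yaml",
    stopwords_file="stopwords.txt",
    lemmatize=True)

content = parser.parse_file("README.rst")      # parse a single file
tokens = chain(*tokenizer.tokenize(content))   # flatten per-sentence token lists
keywords = chief.extract_keywords(tokens)      # keywords found in the file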
Example 5
def reckon(keywords_file=None,
           stopwords_file=None,
           stemmer=None,
           lemmatize=False):
    """Compute keywords and stopwords based on stemmer and lemmatizer configuration.

    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param stemmer: stemmer to be used
    :param lemmatize: True if lemmatization should be done
    :return: computed keywords and stopwords; duplicate entries are not removed
    """
    result = dict.fromkeys(('keywords', 'stopwords'))

    stemmer_instance = Stemmer.get_stemmer(
        stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)
    tokenizer = Tokenizer(stopwords_file,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    result['keywords'] = chief.keywords
    result['stopwords'] = sorted(tokenizer.raw_stopwords) + sorted(
        tokenizer.regexp_stopwords)

    return result
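
A short, hedged usage sketch of reckon; the file names are placeholders, and the shape of the returned dictionary follows directly from the function body above:

# hypothetical file names, shown only to illustrate the call
computed = reckon(keywords_file="keywords.yaml",
                  stopwords_file="stopwords.txt",
                  stemmer=None,
                  lemmatize=False)

print(len(computed['keywords']))   # keywords as exposed by KeywordsChief
print(computed['stopwords'][:5])   # raw stopwords first, then regexp stopwords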
Example 6
def test_raw_stopwords_property():
    """Check the raw_stopwords property."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)
    stopwords = tokenizer.raw_stopwords
    assert stopwords
    expected = {"i", "me", "our", "he", "she"}
    # subset operation
    assert expected <= set(stopwords)
Example 7
def test_stopwords_reading():
    """Check the ability to read stopwords."""
    with open("test_data/stopwords.txt", "r") as fin:
        content = fin.read()
        bytestream = io.BytesIO(content.encode())
        fin = io.TextIOWrapper(bytestream)
        tokenizer = Tokenizer(fin, None)
        assert tokenizer

    with open("test_data/stopwords_bad_re.txt", "r") as fin:
        content = fin.read()
        bytestream = io.BytesIO(content.encode())
        fin = io.TextIOWrapper(bytestream)
        tokenizer = Tokenizer(fin, None)
        assert tokenizer

    with pytest.raises(InvalidInputError):
        tokenizer = Tokenizer({}, None)
Example 8
def lookup(path,
           keywords_file=None,
           stopwords_file=None,
           ignore_errors=False,
           ngram_size=None,
           use_progressbar=False,
           lemmatize=False,
           stemmer=None):
    # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup.

    :param path: path of directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True, if errors should be reported but computation shouldn't be stopped
    :param ngram_size: size of ngrams, if None, ngram size is computed
    :param use_progressbar: True if progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :return: found keywords, reported per file
    """
    ret = {}

    stemmer_instance = Stemmer.get_stemmer(
        stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning(
            "Computed ngram size (%d) does not reflect supplied ngram size (%d), "
            "some synonyms will be omitted", computed_ngram_size, ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file,
                          ngram_size,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    for file in progressbarize(iter_files(path, ignore_errors),
                               progress=use_progressbar):
        _logger.info("Processing file '%s'", file)
        try:
            content = CoreParser().parse_file(file)
            tokens = tokenizer.tokenize(content)
            # No analysis is performed per sentence yet, so flatten the
            # per-sentence token lists into a single token stream.
            tokens = chain(*tokens)
            keywords = chief.extract_keywords(tokens)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s", file,
                              str(exc))
            continue

        ret[file] = keywords

    return ret
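
Calling lookup on a directory tree returns a mapping from each processed file to the keywords found in it. A minimal, hedged usage sketch (the path and file names are hypothetical):

# hypothetical path and configuration files
results = lookup("path/to/project",
                 keywords_file="keywords.yaml",
                 stopwords_file="stopwords.txt",
                 ignore_errors=True,
                 use_progressbar=True)

for file_name, keywords in results.items():
    print(file_name, keywords)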
Example 9
def test_regexp_stopwords_property():
    """Check the regexp_stopwords property."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)
    stopwords = tokenizer.regexp_stopwords
    assert stopwords
    assert "re: [0-9.]+" in stopwords
Example 10
def test_stem_method():
    """Check the _stem method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)

    # test with no stemmer
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with a custom stemmer that leaves tokens unchanged
    tokenizer = Tokenizer("test_data/stopwords.txt", stemmer=CustomStemmer())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with a custom stemmer that prefixes every token with '*'
    tokenizer = Tokenizer("test_data/stopwords.txt", stemmer=CustomStemmer2())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["*foo", "*bar", "*me", "*your", "*6502"]

    # test with a custom stemmer that maps every token to '***'
    tokenizer = Tokenizer("test_data/stopwords.txt", stemmer=CustomStemmer3())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._stem(tokens)
    assert tokens == ["***", "***", "***", "***", "***"]
Example 11
def test_lemmatize_method():
    """Check the _lemmatize method."""
    tokenizer = Tokenizer("test_data/stopwords.txt", None)

    # test with no lemmatizer
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with a custom lemmatizer that leaves tokens unchanged
    tokenizer = Tokenizer("test_data/stopwords.txt",
                          lemmatizer=CustomLemmatizer())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["foo", "bar", "me", "your", "6502"]

    # test with a custom lemmatizer that prefixes every token with '*'
    tokenizer = Tokenizer("test_data/stopwords.txt",
                          lemmatizer=CustomLemmatizer2())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["*foo", "*bar", "*me", "*your", "*6502"]

    # test with a custom lemmatizer that maps every token to '***'
    tokenizer = Tokenizer("test_data/stopwords.txt",
                          lemmatizer=CustomLemmatizer3())
    tokens = ["foo", "bar", "me", "your", "6502"]
    tokenizer._lemmatize(tokens)
    assert tokens == ["***", "***", "***", "***", "***"]
Example 12
def test_initial_state():
    """Check the initial state of Tokenizer."""
    tokenizer = Tokenizer(None, None)
    assert tokenizer