Ejemplo n.º 1
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem a list of words with the Hausa stemmer.

        The input is first passed through decode_object_from_bytes_if_needed().
        Words that are None or empty are copied to the output untouched. When
        the stemmer raises, or returns None / an empty stem, a warning is
        logged and the original word is used as its own stem.

        :param words: list of words to stem
        :return: list of stems, one per input word, in input order
        :raises McLanguageException: if the decoded word list is None
        """
        decoded_words = decode_object_from_bytes_if_needed(words)
        if decoded_words is None:
            raise McLanguageException("Words to stem is None.")

        results = []

        for original in decoded_words:

            # Nothing to stem; keep the value (None or empty) as it is
            if original is None or len(original) == 0:
                log.debug("Word is empty or None.")
                results.append(original)
                continue

            try:
                # The stemmer call itself may raise...
                stemmed = hausastemmer.stem(original)

                # ...and an empty / None result is routed through the same fallback
                if stemmed is None or len(stemmed) == 0:
                    raise Exception("Stem is empty.")

            except Exception as err:
                log.warning("Unable to stem word '{}': {}".format(original, str(err)))
                stemmed = original

            results.append(stemmed)

        # Sanity check: every input word should have produced exactly one stem
        if len(decoded_words) != len(results):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s" % (
                    str(decoded_words), str(results),
                )
            )

        return results
Ejemplo n.º 2
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem a list of words with the Hausa stemmer.

        The input is first passed through decode_object_from_bytes_if_needed().
        Words that are None or empty are copied to the output untouched. If the
        stemmer raises for a word, or returns None / an empty stem, the original
        word is used as its own stem so that one bad word does not abort the
        whole batch.

        :param words: list of words to stem
        :return: list of stems, one per input word, in input order
        :raises McLanguageException: if the decoded word list is None
        """
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []

        for word in words:
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stems.append(word)
                continue

            try:
                # Stemmer might raise an exception; the catch is deliberately
                # broad because the stemmer's failure modes are not known here,
                # and falling back to the unstemmed word is always acceptable
                stem = hausastemmer.stem(word)
            except Exception as ex:
                log.warning("Unable to stem word '{}': {}".format(word, str(ex)))
                stem = word
            else:
                # ...or it might return an empty string / None
                if stem is None or len(stem) == 0:
                    log.debug("Unable to stem word '%s'" % word)
                    stem = word

            stems.append(stem)

        # Sanity check: every input word should have produced exactly one stem
        if len(words) != len(stems):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s"
                % (
                    str(words),
                    str(stems),
                ))

        return stems
Ejemplo n.º 3
0
def test_stemmer_without_dict_lookup():
    """Check stem(term, lookup=False) against TEST_STEMS_WITHOUT_DICT_LOOKUP.

    Entries are visited in sorted order. On a mismatch a diagnostic line is
    printed right before the assertion fails.
    """
    for word, wanted in sorted(TEST_STEMS_WITHOUT_DICT_LOOKUP.items()):
        got = stem(word, lookup=False)

        if wanted != got:
            template = "Testing '%s' without dictionary lookup, expecting to get '%s', got '%s'"
            print(template % (word, wanted, got))

        assert got == wanted
Ejemplo n.º 4
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Return the Hausa stems of the given words.

        The word list goes through decode_object_from_bytes_if_needed() before
        stemming. None / empty words are passed through unchanged. If the
        stemmer raises, or yields None or an empty stem, a warning is logged
        and the word itself is used instead of a stem.

        :param words: list of words to stem
        :return: list of stems, one per input word, in input order
        :raises McLanguageException: if the decoded word list is None
        """
        word_list = decode_object_from_bytes_if_needed(words)
        if word_list is None:
            raise McLanguageException("Words to stem is None.")

        stemmed_words = []

        for token in word_list:
            if token is not None and len(token) > 0:
                try:
                    # Stemmer might raise an exception
                    token_stem = hausastemmer.stem(token)

                    # An empty string / None result takes the same fallback path
                    if token_stem is None or len(token_stem) == 0:
                        raise Exception("Stem is empty.")

                except Exception as stem_error:
                    message = "Unable to stem word '{}': {}".format(token, str(stem_error))
                    log.warning(message)
                    token_stem = token
            else:
                log.debug("Word is empty or None.")
                token_stem = token

            stemmed_words.append(token_stem)

        # Sanity check: one stem is expected per input word
        if len(word_list) != len(stemmed_words):
            mismatch = "Stem count is not the same as word count; words: %s; stems: %s" % (
                str(word_list),
                str(stemmed_words),
            )
            log.warning(mismatch)

        return stemmed_words
Ejemplo n.º 5
0
def test_stemmer_with_bad_data():
    """Check how stem() copes with an empty, a one-character and a very long input."""
    # These inputs are expected to come back unchanged
    for unchanged in ('', 'ą'):
        assert stem(unchanged) == unchanged

    # Won't necessarily return the same string, only one of nearly the same length
    oversized = 'ą' * 1024 * 1024
    assert len(stem(oversized)) > 1024 * 1023
Ejemplo n.º 6
0
def py_hausa_stem(token):
    """Stem a single Hausa token; called from the Perl code.

    The token is passed through decode_object_from_bytes_if_needed() and the
    result is handed to hausastemmer.stem().
    """
    # MC_REWRITE_TO_PYTHON: simplify after rewriting language module to Python.
    return hausastemmer.stem(decode_object_from_bytes_if_needed(token))
Ejemplo n.º 7
0
def py_hausa_stem(token):
    """Hausa stemming entry point used by the Perl code.

    Runs the token through decode_string_from_bytes_if_needed() and returns
    whatever hausastemmer.stem() produces for the decoded value.
    """
    # FIXME MC_REWRITE_TO_PYTHON: simplify after rewriting language module to Python.
    decoded_token = decode_string_from_bytes_if_needed(token)
    stemmed_token = hausastemmer.stem(decoded_token)
    return stemmed_token