Beispiel #1
0
    def test_register_function_warns_when_adding_function_with_same_label(
            self):
        """Re-registering an already-taken label logs a warning."""
        # First registration claims the "fn" label.
        Pipeline.register_function(self.fn, "fn")
        # The duplicate registration must emit exactly one warning.
        with patch("lunr.pipeline.log") as log_mock:
            Pipeline.register_function(self.fn, "fn")
            log_mock.warning.assert_called_once()
Beispiel #2
0
def register_languages():
    """Register all supported languages to ensure compatibility."""
    # English is handled by lunr's default pipeline, so it is excluded here.
    non_english = set(SUPPORTED_LANGUAGES) - {"en"}
    for language in non_english:
        stemmer_fn = partial(nltk_stemmer, get_language_stemmer(language))
        Pipeline.register_function(stemmer_fn, "stemmer-{}".format(language))
Beispiel #3
0
    def test_load_with_registered_functions(self):
        """Loading a serialized pipeline resolves labels to functions."""
        Pipeline.register_function(fn, "fn")
        serialized_pipeline = ["fn"]

        pipeline = Pipeline.load(serialized_pipeline)

        # The single "fn" label must resolve to the registered function.
        assert len(pipeline) == 1
        assert pipeline._stack[0] == fn
Beispiel #4
0
def generate_stop_word_filter(stop_words, language=None):
    """Build a stopWordFilter function from the provided list of stop words.

    The built in `stop_word_filter` is produced by this factory; it can also
    be used to create custom `stop_word_filter` functions for applications
    or non English languages.
    """
    def stop_word_filter(token, i=None, tokens=None):
        # Drop the token (implicitly return None) when it is empty or
        # appears in the stop word list.
        if token and str(token) not in stop_words:
            return token

    # Label is camelCased for compatibility with lunr.js.
    if language is not None:
        label = "stopWordFilter-{}".format(language)
    else:
        label = "stopWordFilter"
    Pipeline.register_function(stop_word_filter, label)
    return stop_word_filter
Beispiel #5
0
def test_add_token_metadata():
    """Custom pipeline functions can attach metadata surfaced in matches."""
    builder = get_default_builder()

    def token_length(token, i, tokens):
        # Store each token's character count as metadata on the token.
        token.metadata["token_length"] = len(str(token))
        return token

    # Register the function, add it to the indexing pipeline, and whitelist
    # the metadata key so it survives into the match data.
    Pipeline.register_function(token_length)
    builder.pipeline.add(token_length)
    builder.metadata_whitelist.append("token_length")

    idx = lunr("id", ("title", "body"), documents, builder=builder)

    [result, _, _] = idx.search("green")
    green_meta = result["match_data"].metadata["green"]
    assert green_meta["title"]["token_length"] == [5]
    assert green_meta["body"]["token_length"] == [5, 5]
Beispiel #6
0
def get_nltk_builder(languages):
    """Returns a builder with stemmers for all languages added to it.

    Args:
        languages (list): A list of supported languages.

    Returns:
        Builder: A builder whose pipeline contains a combined trimmer and
        one stop word filter per requested language.
    """
    # NOTE(review): per-language stemmer registration was previously present
    # here as commented-out code; it has been removed as dead code. Confirm
    # stemmers are intentionally not added to this builder's pipelines.
    all_stopwords_filters = []
    all_word_characters = set()

    for language in languages:
        if language == "en":
            # Use Lunr's default English stop words and word characters.
            all_stopwords_filters.append(stop_word_filter)
            all_word_characters.update({r"\w"})
        else:
            stopwords, word_characters = _get_stopwords_and_word_characters(
                language
            )
            all_stopwords_filters.append(
                generate_stop_word_filter(stopwords, language=language)
            )
            all_word_characters.update(word_characters)

    builder = Builder()
    # One trimmer covering the union of word characters for all languages.
    multi_trimmer = generate_trimmer("".join(sorted(all_word_characters)))
    Pipeline.register_function(
        multi_trimmer, "lunr-multi-trimmer-{}".format("-".join(languages))
    )
    builder.pipeline.reset()

    for fn in chain([multi_trimmer], all_stopwords_filters):
        builder.pipeline.add(fn)

    return builder
Beispiel #7
0
    def generate_output(self, writer):
        """Build a lunr search index over all articles and write it as JSON."""
        pages = [self.create_node(article)
                 for article in self.context['articles']]
        path = os.path.join(self.output_path, 'search_index.json')

        # Fields that are indexed for full-text search.
        pages_to_index = [
            {'id': page['id'], 'title': page['title'], 'text': page['text']}
            for page in pages
        ]

        # Per-page payload returned alongside search results.
        additional_data = {}
        for page in pages:
            additional_data[page['id']] = {
                'url': page['url'],
                'title': page['title'],
                'summary': page['summary'],
            }

        Pipeline.register_function(special_chars_remover,
                                   'specialCharsRemover')

        bldr = Builder()
        bldr.pipeline.add(trimmer, stop_word_filter, stemmer,
                          special_chars_remover)
        bldr.search_pipeline.add(stemmer)
        bldr.ref('id')
        # Title matches are weighted 10x relative to body text.
        bldr.field('title', 10)
        bldr.field('text')

        for page in pages_to_index:
            bldr.add(page)
        idx = bldr.build().serialize()

        with open(path, 'w') as idxfile:
            json.dump({'index': idx, 'data': additional_data}, idxfile)
Beispiel #8
0
        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k + 1]


# Module-level PorterStemmer instance shared by every stemmer() call.
porter_stemmer = PorterStemmer()


def stemmer(token, i=None, tokens=None):
    """Wrapper around the PorterStemmer for inclusion in pipeline.

    Args:
        token (lunr.Token): The token to stem.
        i (int): The index of the token in a set.
        tokens (list): A list of tokens representing the set.

    Returns:
        The result of ``token.update`` applied with the Porter stem function.
    """
    return token.update(porter_stemmer.stem)


# Register under the label lunr.js uses so serialized pipelines can load it.
Pipeline.register_function(stemmer, "stemmer")
Beispiel #9
0
import re

from lunr.pipeline import Pipeline

# Captures the inner run of word characters, ignoring any non-word padding
# on either side of the string.
full_re = re.compile(r"^\W*?([^\W]+)\W*?$")


def trimmer(token, i=None, tokens=None):
    """Strip leading and trailing non-word characters from the token."""

    def trim(s, metadata=None):
        # Leave the string untouched when the pattern does not match.
        m = full_re.match(s)
        return s if m is None else m.group(1)

    return token.update(trim)


# Register under the label lunr.js uses for its trimmer.
Pipeline.register_function(trimmer, "trimmer")
Beispiel #10
0
    def test_register_function_adds_function_to_list_of_registered_functions(
            self):
        """A registered function is stored under its label."""
        Pipeline.register_function(self.fn, "fn")

        registered = Pipeline.registered_functions["fn"]
        assert registered == self.fn
Beispiel #11
0
    def test_register_function_adds_a_label_property_to_the_function(self):
        """Registration stamps the label onto the function object itself."""
        Pipeline.register_function(self.fn, "fn")

        assert getattr(self.fn, "label") == "fn"
Beispiel #12
0
    def test_serialize_returns_array_of_registered_function_labels(self):
        """Serializing a pipeline yields the labels of its functions."""
        Pipeline.register_function(fn, "fn")
        self.pipeline.add(fn)

        serialized = self.pipeline.serialize()
        assert serialized == ["fn"]
        assert repr(self.pipeline) == '<Pipeline stack="fn">'
Beispiel #13
0
    def test_register_function_adds_defaults_to_name_of_the_function(self):
        """Without an explicit label, the function's __name__ is used."""
        Pipeline.register_function(self.fn)

        expected = self.fn.__name__
        assert self.fn.label == expected