Beispiel #1
0
    def test_builder_extracts_nested_properties_from_document(self):
        """A field's ``extractor`` callable supplies the text to index.

        The field "name" does not exist at the top level of the document;
        the extractor digs it out of the nested "person" mapping.
        """

        def nested_name(doc):
            return doc["person"]["name"]

        builder = self.builder = Builder()
        builder.field("name", extractor=nested_name)
        builder.add({"id": "id", "person": {"name": "bob"}})

        postings = builder.inverted_index["bob"]["name"]
        assert postings["id"] == {}
Beispiel #2
0
    def setup_method(self, method):
        """Index one document and keep the builder and built index on ``self``."""
        builder = self.builder = Builder()
        document = {"id": "id", "title": "test", "body": "missing"}

        builder.ref("id")
        # Only "title" is registered as a field; "body" is not.
        builder.field("title")
        builder.add(document)
        self.index = builder.build()
Beispiel #3
0
    def test_builder_casts_docrefs_to_strings(self):
        """An integer document ref is expected to appear as the key "123"."""
        builder = self.builder = Builder()
        builder.ref("id")
        builder.field("title")

        builder.add({"id": 123, "title": "test", "body": "missing"})

        # Dotted path: term -> field name -> document ref.
        _assert_deep_keys(builder.inverted_index, "test.title.123")
Beispiel #4
0
 def test_define_fields_to_index(self):
     """Registering a field stores a Field entry with default settings."""
     builder = Builder()
     builder.field("foo")

     registered = builder._fields
     assert len(registered) == 1

     foo = registered["foo"]
     assert foo.name == "foo"
     assert foo.boost == 1
     assert foo.extractor is None
     assert repr(foo) == '<Field "foo" boost="1">'
     # A Field is expected to hash the same as its name.
     assert hash(foo) == hash("foo")
Beispiel #5
0
    def test_builder_field_term_frequency_and_length(self):
        """Adding a document records per-field term counts and token length."""
        builder = self.builder = Builder()
        builder.ref("id")
        builder.field("title")

        builder.add({"id": "a", "title": "test a testing test", "body": "missing"})

        # Keys are "<field name>/<document ref>".
        expected_frequencies = {"title/a": {"test": 2, "a": 1, "testing": 1}}
        expected_lengths = {"title/a": 4}

        assert builder.field_term_frequencies == expected_frequencies
        assert builder.field_lengths == expected_lengths
Beispiel #6
0
    def test_builder_metadata_whitelist_includes_metadata_in_index(self):
        """Whitelisted metadata keys are stored alongside each posting."""
        builder = self.builder = Builder()
        builder.ref("id")
        builder.field("title")
        builder.metadata_whitelist = ["position"]

        documents = (
            {"id": "a", "title": "test", "body": "missing"},
            {"id": "b", "title": "another test", "body": "missing"},
        )
        for document in documents:
            builder.add(document)

        postings = builder.inverted_index["test"]["title"]
        # Presumably each position is [start offset, length] of the token.
        assert postings["a"] == {"position": [[0, 4]]}
        assert postings["b"] == {"position": [[8, 4]]}
Beispiel #7
0
class TestBuilderBuild:
    """Checks on a Builder's state after ``build()`` ran on one document."""

    def setup_method(self, method):
        """Index a single document; keep the builder and the built index."""
        builder = self.builder = Builder()
        document = {"id": "id", "title": "test", "body": "missing"}

        builder.ref("id")
        # "body" is present in the document but never registered as a field.
        builder.field("title")
        builder.add(document)
        self.index = builder.build()

    def test_adds_tokens_to_inverted_index(self):
        """The term "test" is indexed under field "title" for ref "id"."""
        _assert_deep_keys(self.builder.inverted_index, "test.title.id")

    def test_builds_vector_space_of_the_document_fields(self):
        """A Vector is stored under the "<field>/<ref>" key."""
        vectors = self.builder.field_vectors
        assert "title/id" in vectors
        assert isinstance(vectors["title/id"], Vector)

    def test_skips_fields_not_defined_for_indexing(self):
        """Text from the unregistered "body" field is not indexed."""
        assert "missing" not in self.builder.inverted_index

    def test_builds_a_token_set_for_the_corpus(self):
        """The corpus token set intersects with a token set for "test"."""
        needle = TokenSet.from_string("test")
        matches = self.builder.token_set.intersect(needle).to_list()
        assert "test" in matches

    def test_calculates_document_count(self):
        """The average length of the "title" field is 1 for this corpus."""
        # NOTE(review): despite the test name, this asserts the average
        # field length, not a document count.
        assert self.builder.average_field_length["title"] == 1

    def test_index_is_returned(self):
        """``build()`` returns an Index instance."""
        assert isinstance(self.index, Index)
Beispiel #8
0
def lunr_builder(ref, fields):
    """Configure a lunr.Builder with default pipelines, a ref and fields.

    No documents are added and ``build()`` is not called; the caller is
    expected to do both.

    Args:
        ref: The value passed to ``Builder.ref``.
        fields: An iterable whose items are each passed to ``Builder.field``.

    Returns:
        Builder: The configured builder (not a built Index).
    """
    configured = Builder()
    configured.pipeline.add(trimmer, stop_word_filter, stemmer)
    configured.search_pipeline.add(stemmer)
    configured.ref(ref)
    for field_name in fields:
        configured.field(field_name)
    return configured
Beispiel #9
0
def get_nltk_builder(languages):
    """Return a builder with a multi-language trimmer and stop-word filters.

    The indexing pipeline is reset and then filled with one trimmer built
    from the word characters of every requested language, followed by one
    stop-word filter per language. No stemmer is added to either the
    indexing or the search pipeline here.

    Args:
        languages (list): A list of supported languages.
    """
    stop_word_filters = []
    word_characters = set()

    for code in languages:
        if code != "en":
            stopwords, characters = _get_stopwords_and_word_characters(code)
            stop_word_filters.append(
                generate_stop_word_filter(stopwords, language=code)
            )
            word_characters.update(characters)
            continue

        # English uses Lunr's default stop-word filter and word class.
        stop_word_filters.append(stop_word_filter)
        word_characters.add(r"\w")

    builder = Builder()
    # Sorting makes the combined character class independent of set order.
    multi_trimmer = generate_trimmer("".join(sorted(word_characters)))
    label = "lunr-multi-trimmer-{}".format("-".join(languages))
    Pipeline.register_function(multi_trimmer, label)
    builder.pipeline.reset()

    for step in [multi_trimmer] + stop_word_filters:
        builder.pipeline.add(step)

    return builder
Beispiel #10
0
def get_default_builder(languages=None):
    """Create a new, pre-configured Builder instance.

    Useful as a starting point to tweak the defaults.

    Args:
        languages (str or list, optional): Language code(s). Only honoured
            when ``lang.LANGUAGE_SUPPORT`` is truthy; otherwise the default
            trimmer / stop-word filter / stemmer pipeline is used.

    Returns:
        Builder: The configured builder.

    Raises:
        RuntimeError: If any requested language is not in
            ``lang.SUPPORTED_LANGUAGES``.
    """
    if languages is None or not lang.LANGUAGE_SUPPORT:
        default_builder = Builder()
        default_builder.pipeline.add(trimmer, stop_word_filter, stemmer)
        default_builder.search_pipeline.add(stemmer)
        return default_builder

    # Accept a single language code as shorthand for a one-item list.
    if isinstance(languages, str):
        languages = [languages]

    unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
    if unsupported_languages:
        message = (
            "The specified languages {} are not supported, "
            "please choose one of {}"
        ).format(
            ", ".join(unsupported_languages),
            ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
        )
        raise RuntimeError(message)

    return lang.get_nltk_builder(languages)
Beispiel #11
0
class TestBuilderUse:
    """Tests for ``Builder.use``, which applies a plugin to the builder.

    Each test records the plugin's calls and asserts on them *after*
    ``use()`` returns. Assertions placed only inside the plugin body would
    pass vacuously if ``use()`` never invoked the plugin at all.
    """

    def setup_method(self, method):
        """Give every test a fresh Builder."""
        self.builder = Builder()

    def test_calls_plugin_function(self):
        """``use()`` invokes the plugin exactly once."""
        calls = []

        def plugin(*args):
            calls.append(args)

        self.builder.use(plugin)

        assert len(calls) == 1

    def test_plugin_is_called_with_builder_as_first_argument(self):
        """The builder itself is the plugin's first positional argument."""
        received = []

        def plugin(builder):
            received.append(builder)

        self.builder.use(plugin)

        assert len(received) == 1
        assert received[0] is self.builder

    def test_forwards_arguments_to_the_plugin(self):
        """Extra positional and keyword arguments reach the plugin as given."""
        calls = []

        def plugin(builder, *args, **kwargs):
            calls.append((args, kwargs))

        self.builder.use(plugin, 1, 2, 3, foo="bar")

        assert calls == [((1, 2, 3), {"foo": "bar"})]
Beispiel #12
0
 def test_b_less_than_zero(self):
     """Setting ``b`` below zero is expected to leave ``_b`` at 0."""
     lower_bound = 0
     tuned = Builder()
     tuned.b(-1)
     assert tuned._b == lower_bound
Beispiel #13
0
    def test_builder_field_raises_if_contains_slash(self):
        """Registering a field name containing "/" raises ValueError."""
        builder = self.builder = Builder()
        invalid_name = "foo/bar"

        with pytest.raises(ValueError):
            builder.field(invalid_name)
Beispiel #14
0
def lunr(ref, fields, documents, languages=None):
    """Configure a builder, index the documents and return a lunr.Index.

    Args:
        ref (str): The key in the documents to be used as the reference.
        fields (list): A list of strings naming the document fields to
            index. An item may instead be a dictionary, which is expanded
            into keyword arguments for ``Builder.field`` (`field_name`,
            `boost` and `extractor`, a callable taking the document as its
            single argument and returning the string to index).
        documents (list): The list of dictionaries representing the
            documents to index. An item may instead be a 2-item tuple or
            list: the document first, its associated attributes second.
        languages (str or list, optional): The languages to use if using
            NLTK language support, ignored if NLTK is not available.

    Returns:
        Index: The populated Index ready to search against.

    Raises:
        RuntimeError: If language support is available and any requested
            language is not in ``lang.SUPPORTED_LANGUAGES``.
    """
    wants_languages = languages is not None and lang.LANGUAGE_SUPPORT
    if not wants_languages:
        builder = Builder()
        # Only trimming and stop-word filtering are configured here; no
        # stemmer is added to the indexing or the search pipeline.
        builder.pipeline.add(trimmer, stop_word_filter)
    else:
        # Accept a single language code as shorthand for a one-item list.
        if isinstance(languages, basestring):
            languages = [languages]

        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            message = (
                "The specified languages {} are not supported, "
                "please choose one of {}"
            ).format(
                ", ".join(unsupported_languages),
                ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
            )
            raise RuntimeError(message)
        builder = lang.get_nltk_builder(languages)

    builder.ref(ref)
    for spec in fields:
        if not isinstance(spec, dict):
            builder.field(spec)
            continue
        builder.field(**spec)

    for entry in documents:
        if not isinstance(entry, (tuple, list)):
            builder.add(entry)
            continue
        builder.add(entry[0], attributes=entry[1])

    return builder.build()
Beispiel #15
0
class TestBuilderAdd:
    """Tests for what ``Builder.add`` records when documents are added."""

    def test_builder_casts_docrefs_to_strings(self):
        """An integer document ref is expected to appear as the key "123"."""
        builder = self.builder = Builder()
        builder.ref("id")
        builder.field("title")

        builder.add({"id": 123, "title": "test", "body": "missing"})

        # Dotted path: term -> field name -> document ref.
        _assert_deep_keys(builder.inverted_index, "test.title.123")

    def test_builder_metadata_whitelist_includes_metadata_in_index(self):
        """Whitelisted metadata keys are stored alongside each posting."""
        builder = self.builder = Builder()
        builder.ref("id")
        builder.field("title")
        builder.metadata_whitelist = ["position"]

        documents = (
            {"id": "a", "title": "test", "body": "missing"},
            {"id": "b", "title": "another test", "body": "missing"},
        )
        for document in documents:
            builder.add(document)

        postings = builder.inverted_index["test"]["title"]
        # Presumably each position is [start offset, length] of the token.
        assert postings["a"] == {"position": [[0, 4]]}
        assert postings["b"] == {"position": [[8, 4]]}

    def test_builder_field_raises_if_contains_slash(self):
        """Registering a field name containing "/" raises ValueError."""
        builder = self.builder = Builder()
        invalid_name = "foo/bar"

        with pytest.raises(ValueError):
            builder.field(invalid_name)

    def test_builder_extracts_nested_properties_from_document(self):
        """A field's ``extractor`` callable supplies the text to index."""

        def nested_name(doc):
            return doc["person"]["name"]

        builder = self.builder = Builder()
        builder.field("name", extractor=nested_name)
        builder.add({"id": "id", "person": {"name": "bob"}})

        postings = builder.inverted_index["bob"]["name"]
        assert postings["id"] == {}

    def test_builder_field_term_frequency_and_length(self):
        """Adding a document records per-field term counts and token length."""
        builder = self.builder = Builder()
        builder.ref("id")
        builder.field("title")

        builder.add({"id": "a", "title": "test a testing test", "body": "missing"})

        # Keys are "<field name>/<document ref>".
        expected_frequencies = {"title/a": {"test": 2, "a": 1, "testing": 1}}
        expected_lengths = {"title/a": 4}

        assert builder.field_term_frequencies == expected_frequencies
        assert builder.field_lengths == expected_lengths
Beispiel #16
0
 def test_k1_default_value(self):
     """A freshly created Builder is expected to have ``_k1`` equal to 1.2."""
     default_k1 = Builder()._k1
     assert default_k1 == 1.2
Beispiel #17
0
 def setup_method(self, method):
     """Give every test a fresh Builder on ``self.builder``."""
     fresh_builder = Builder()
     self.builder = fresh_builder
Beispiel #18
0
 def test_default_reference(self):
     """A freshly created Builder is expected to use "id" as its ref."""
     default_ref = Builder()._ref
     assert default_ref == "id"
Beispiel #19
0
 def test_defining_a_reference_field(self):
     """``ref()`` stores the given name in ``_ref``."""
     ref_name = "foo"
     configured = Builder()
     configured.ref(ref_name)
     assert configured._ref == ref_name
Beispiel #20
0
 def test_k1_can_be_set(self):
     """``k1()`` stores the given value in ``_k1``."""
     new_k1 = 1.6
     tuned = Builder()
     tuned.k1(new_k1)
     assert tuned._k1 == new_k1
Beispiel #21
0
 def test_b_higher_than_one(self):
     """Setting ``b`` above one is expected to leave ``_b`` at 1."""
     upper_bound = 1
     tuned = Builder()
     tuned.b(1.5)
     assert tuned._b == upper_bound
Beispiel #22
0
    def generate_output(self, writer):
        """Build a search index from the articles and write it out as JSON.

        Every article in ``self.context['articles']`` is turned into a node
        via ``self.create_node``. The nodes' id/title/text are indexed, and
        the serialized index plus per-page url/title/summary are written to
        ``search_index.json`` under ``self.output_path``.

        Args:
            writer: Not used by this method.
        """
        nodes = [
            self.create_node(article) for article in self.context['articles']
        ]
        index_path = os.path.join(self.output_path, 'search_index.json')

        # Only these three keys are handed to the index builder.
        documents = []
        for node in nodes:
            documents.append({
                'id': node['id'],
                'title': node['title'],
                'text': node['text'],
            })

        # Display data written next to the index, keyed by page id.
        page_data = {}
        for node in nodes:
            page_id = node['id']
            page_data[page_id] = {
                'url': node['url'],
                'title': node['title'],
                'summary': node['summary'],
            }

        Pipeline.register_function(special_chars_remover,
                                   'specialCharsRemover')

        builder = Builder()
        builder.pipeline.add(trimmer, stop_word_filter, stemmer,
                             special_chars_remover)
        builder.search_pipeline.add(stemmer)
        builder.ref('id')
        # The second positional argument is presumably the field boost.
        builder.field('title', 10)
        builder.field('text')

        for document in documents:
            builder.add(document)
        serialized_index = builder.build().serialize()

        payload = {
            'index': serialized_index,
            'data': page_data,
        }
        with open(index_path, 'w') as index_file:
            json.dump(payload, index_file)
Beispiel #23
0
 def test_b_within_range(self):
     """A ``b`` value of 0.5 is stored in ``_b`` unchanged."""
     in_range = 0.5
     tuned = Builder()
     tuned.b(in_range)
     assert tuned._b == in_range
Beispiel #24
0
 def test_b_default_value(self):
     """A freshly created Builder is expected to have ``_b`` equal to 0.75."""
     default_b = Builder()._b
     assert default_b == 0.75