def test_builder_extracts_nested_properties_from_document(self):
    """A field extractor can read its value from a nested part of the document."""

    def person_name(document):
        return document["person"]["name"]

    builder = self.builder = Builder()
    builder.field("name", extractor=person_name)
    builder.add({"id": "id", "person": {"name": "bob"}})

    # No metadata whitelist is configured, so the posting is an empty dict.
    postings = builder.inverted_index["bob"]["name"]
    assert postings["id"] == {}
def setup_method(self, method):
    """Index one document and keep both the builder and the built index."""
    self.builder = builder = Builder()
    builder.ref("id")
    builder.field("title")
    # "body" is deliberately not registered as a field.
    builder.add({"id": "id", "title": "test", "body": "missing"})
    self.index = builder.build()
def test_builder_casts_docrefs_to_strings(self):
    """A document whose ref is the integer 123 is indexed under "test.title.123"."""
    document = {"id": 123, "title": "test", "body": "missing"}

    self.builder = builder = Builder()
    builder.ref("id")
    builder.field("title")
    builder.add(document)

    _assert_deep_keys(builder.inverted_index, "test.title.123")
def test_define_fields_to_index(self):
    """Registering a field stores one Field with boost 1 and no extractor."""
    builder = Builder()
    builder.field("foo")

    registered = builder._fields
    assert len(registered) == 1

    foo = registered["foo"]
    assert foo.name == "foo"
    assert foo.boost == 1
    assert foo.extractor is None
    assert repr(foo) == '<Field "foo" boost="1">'
    # A Field hashes like its name.
    assert hash(foo) == hash("foo")
def test_builder_field_term_frequency_and_length(self):
    """Term frequencies and field lengths are recorded under "field/ref" keys."""
    self.builder = builder = Builder()
    builder.ref("id")
    builder.field("title")
    builder.add({"id": "a", "title": "test a testing test", "body": "missing"})

    expected_frequencies = {"title/a": {"test": 2, "a": 1, "testing": 1}}
    expected_lengths = {"title/a": 4}

    assert builder.field_term_frequencies == expected_frequencies
    assert builder.field_lengths == expected_lengths
def test_builder_metadata_whitelist_includes_metadata_in_index(self):
    """Whitelisted "position" metadata is stored alongside each posting."""
    self.builder = builder = Builder()
    builder.ref("id")
    builder.field("title")
    builder.metadata_whitelist = ["position"]

    documents = (
        {"id": "a", "title": "test", "body": "missing"},
        {"id": "b", "title": "another test", "body": "missing"},
    )
    for document in documents:
        builder.add(document)

    postings = builder.inverted_index["test"]["title"]
    assert postings["a"] == {"position": [[0, 4]]}
    assert postings["b"] == {"position": [[8, 4]]}
class TestBuilderBuild:
    """Checks on the builder state after indexing one document and building."""

    def setup_method(self, method):
        """Index a single document and build the index from it."""
        self.builder = builder = Builder()
        builder.ref("id")
        builder.field("title")
        builder.add({"id": "id", "title": "test", "body": "missing"})
        self.index = builder.build()

    def test_adds_tokens_to_inverted_index(self):
        inverted_index = self.builder.inverted_index
        _assert_deep_keys(inverted_index, "test.title.id")

    def test_builds_vector_space_of_the_document_fields(self):
        vectors = self.builder.field_vectors
        assert "title/id" in vectors
        assert isinstance(vectors["title/id"], Vector)

    def test_skips_fields_not_defined_for_indexing(self):
        # "missing" is the value of the unregistered "body" field.
        inverted_index = self.builder.inverted_index
        assert "missing" not in inverted_index

    def test_builds_a_token_set_for_the_corpus(self):
        needle = TokenSet.from_string("test")
        matches = self.builder.token_set.intersect(needle).to_list()
        assert "test" in matches

    def test_calculates_document_count(self):
        # Despite the test name, the assertion is on the average field length.
        averages = self.builder.average_field_length
        assert averages["title"] == 1

    def test_index_is_returned(self):
        built = self.index
        assert isinstance(built, Index)
def lunr_builder(ref, fields):
    """A convenience function to configure and construct a lunr.Builder.

    The builder gets the trimmer, stop word filter and stemmer in its
    indexing pipeline and the stemmer in its search pipeline. No documents
    are added and no index is built here.

    Args:
        ref (str): The key in the documents to be used as the reference.
        fields (list): A list of strings naming the fields to index. An item
            may also be a dict, which is passed to `Builder.field` as keyword
            arguments.

    Returns:
        Builder: The configured builder.
    """
    builder = Builder()
    builder.pipeline.add(trimmer, stop_word_filter, stemmer)
    builder.search_pipeline.add(stemmer)
    builder.ref(ref)
    for field in fields:
        if isinstance(field, dict):
            # A dict field definition is expanded into keyword arguments.
            builder.field(**field)
        else:
            builder.field(field)
    return builder
def get_nltk_builder(languages):
    """Returns a builder with a trimmer and stop word filters for all languages.

    The indexing pipeline is reset and then filled with one trimmer covering
    the word characters of every requested language, followed by one stop
    word filter per language, in the order the languages were given.

    No stemmer is added by this function, and the search pipeline is left
    untouched.

    Args:
        languages (list): A list of supported languages.

    Returns:
        Builder: The builder with its indexing pipeline configured.
    """
    stop_word_filters = []
    word_characters = set()

    for language in languages:
        if language == "en":
            # English uses lunr's default stop word filter.
            stop_word_filters.append(stop_word_filter)
            word_characters.add(r"\w")
        else:
            stopwords, language_characters = _get_stopwords_and_word_characters(
                language
            )
            stop_word_filters.append(
                generate_stop_word_filter(stopwords, language=language)
            )
            word_characters.update(language_characters)

    builder = Builder()
    # Sorting gives the trimmer the same character class regardless of set order.
    multi_trimmer = generate_trimmer("".join(sorted(word_characters)))
    Pipeline.register_function(
        multi_trimmer, "lunr-multi-trimmer-{}".format("-".join(languages))
    )
    builder.pipeline.reset()

    for fn in chain([multi_trimmer], stop_word_filters):
        builder.pipeline.add(fn)

    return builder
def get_default_builder(languages=None):
    """Creates a new pre-configured instance of Builder.

    Useful as a starting point to tweak the defaults.

    Args:
        languages (str or list, optional): The language or languages to
            configure the builder for. Only used when
            `lang.LANGUAGE_SUPPORT` is truthy; otherwise the default
            pipeline is used.

    Returns:
        Builder: The configured builder.

    Raises:
        RuntimeError: If any requested language is not in
            `lang.SUPPORTED_LANGUAGES`.
    """
    if languages is not None and lang.LANGUAGE_SUPPORT:
        if isinstance(languages, str):
            languages = [languages]

        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            raise RuntimeError(
                "The specified languages {} are not supported, "
                "please choose one of {}".format(
                    # Sorted so the message does not depend on set ordering.
                    ", ".join(sorted(unsupported_languages)),
                    ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
                )
            )
        builder = lang.get_nltk_builder(languages)
    else:
        builder = Builder()
        builder.pipeline.add(trimmer, stop_word_filter, stemmer)
        builder.search_pipeline.add(stemmer)

    return builder
class TestBuilderUse:
    """Tests for `Builder.use`.

    Each plugin records its calls and the assertions run after `use`
    returns, so a test fails if the plugin is never invoked.
    """

    def setup_method(self, method):
        self.builder = Builder()

    def test_calls_plugin_function(self):
        calls = []

        def plugin(*args):
            calls.append(args)

        self.builder.use(plugin)

        assert calls, "plugin was never called"

    def test_plugin_is_called_with_builder_as_first_argument(self):
        received = []

        def plugin(builder):
            received.append(builder)

        self.builder.use(plugin)

        assert received, "plugin was never called"
        assert all(builder is self.builder for builder in received)

    def test_forwards_arguments_to_the_plugin(self):
        received = []

        def plugin(builder, *args, **kwargs):
            received.append((args, kwargs))

        self.builder.use(plugin, 1, 2, 3, foo="bar")

        assert received, "plugin was never called"
        for args, kwargs in received:
            assert args == (1, 2, 3)
            assert kwargs == {"foo": "bar"}
def test_b_less_than_zero(self):
    """Setting `b` to -1 leaves the stored value at 0."""
    below_range = -1
    tuned = Builder()
    tuned.b(below_range)
    assert tuned._b == 0
def test_builder_field_raises_if_contains_slash(self):
    """Registering the field name "foo/bar" raises ValueError."""
    self.builder = builder = Builder()
    invalid_name = "foo/bar"
    with pytest.raises(ValueError):
        builder.field(invalid_name)
def lunr(ref, fields, documents, languages=None):
    """A convenience function to configure and construct a lunr.Index.

    When no NLTK-backed builder is used, the indexing pipeline consists of
    the trimmer and the stop word filter only. No stemmer is added to the
    indexing or search pipelines.

    Args:
        ref (str): The key in the documents to be used as the reference.
        fields (list): A list of strings defining fields in the documents to
            index. Optionally a list of dictionaries with three keys:
            `field_name` defining the document's field, `boost` an integer
            defining a boost to be applied to the field, and `extractor`
            a callable taking the document as a single argument and returning
            a string located in the document in a particular way.
        documents (list): The list of dictionaries representing the documents
            to index. Optionally a 2-tuple of dicts, the first one being
            the document and the second the associated attributes to it.
        languages (str or list, optional): The languages to use if using
            NLTK language support, ignored if NLTK is not available.

    Returns:
        Index: The populated Index ready to search against.

    Raises:
        RuntimeError: If any requested language is not in
            `lang.SUPPORTED_LANGUAGES`.
    """
    if languages is not None and lang.LANGUAGE_SUPPORT:
        # NOTE(review): `basestring` is not a Python 3 builtin; it is assumed
        # to be provided by this module's imports -- confirm.
        if isinstance(languages, basestring):
            languages = [languages]

        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            raise RuntimeError(
                "The specified languages {} are not supported, "
                "please choose one of {}".format(
                    # Sorted so the message does not depend on set ordering.
                    ", ".join(sorted(unsupported_languages)),
                    ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
                )
            )
        builder = lang.get_nltk_builder(languages)
    else:
        builder = Builder()
        builder.pipeline.add(trimmer, stop_word_filter)

    builder.ref(ref)
    for field in fields:
        if isinstance(field, dict):
            builder.field(**field)
        else:
            builder.field(field)

    for document in documents:
        if isinstance(document, (tuple, list)):
            # (document, attributes) pair.
            builder.add(document[0], attributes=document[1])
        else:
            builder.add(document)

    return builder.build()
class TestBuilderAdd:
    """Tests for adding documents to a Builder and the state that results."""

    def test_builder_casts_docrefs_to_strings(self):
        document = {"id": 123, "title": "test", "body": "missing"}

        self.builder = builder = Builder()
        builder.ref("id")
        builder.field("title")
        builder.add(document)

        _assert_deep_keys(builder.inverted_index, "test.title.123")

    def test_builder_metadata_whitelist_includes_metadata_in_index(self):
        self.builder = builder = Builder()
        builder.ref("id")
        builder.field("title")
        builder.metadata_whitelist = ["position"]

        documents = (
            {"id": "a", "title": "test", "body": "missing"},
            {"id": "b", "title": "another test", "body": "missing"},
        )
        for document in documents:
            builder.add(document)

        postings = builder.inverted_index["test"]["title"]
        assert postings["a"] == {"position": [[0, 4]]}
        assert postings["b"] == {"position": [[8, 4]]}

    def test_builder_field_raises_if_contains_slash(self):
        self.builder = builder = Builder()
        invalid_name = "foo/bar"
        with pytest.raises(ValueError):
            builder.field(invalid_name)

    def test_builder_extracts_nested_properties_from_document(self):
        def person_name(document):
            return document["person"]["name"]

        self.builder = builder = Builder()
        builder.field("name", extractor=person_name)
        builder.add({"id": "id", "person": {"name": "bob"}})

        postings = builder.inverted_index["bob"]["name"]
        assert postings["id"] == {}

    def test_builder_field_term_frequency_and_length(self):
        self.builder = builder = Builder()
        builder.ref("id")
        builder.field("title")
        builder.add(
            {"id": "a", "title": "test a testing test", "body": "missing"}
        )

        expected_frequencies = {"title/a": {"test": 2, "a": 1, "testing": 1}}
        expected_lengths = {"title/a": 4}

        assert builder.field_term_frequencies == expected_frequencies
        assert builder.field_lengths == expected_lengths
def test_k1_default_value(self):
    """A new Builder has `_k1` equal to 1.2."""
    expected_k1 = 1.2
    untouched = Builder()
    assert untouched._k1 == expected_k1
def setup_method(self, method):
    """Attach a newly constructed Builder to the test instance."""
    fresh_builder = Builder()
    self.builder = fresh_builder
def test_default_reference(self):
    """A new Builder uses "id" as its reference field."""
    untouched = Builder()
    reference = untouched._ref
    assert reference == "id"
def test_defining_a_reference_field(self):
    """Calling `ref` stores the given name as the reference field."""
    reference_name = "foo"
    configured = Builder()
    configured.ref(reference_name)
    assert configured._ref == "foo"
def test_k1_can_be_set(self):
    """Calling `k1` stores the given value."""
    requested_k1 = 1.6
    tuned = Builder()
    tuned.k1(requested_k1)
    assert tuned._k1 == 1.6
def test_b_higher_than_one(self):
    """Setting `b` to 1.5 leaves the stored value at 1."""
    above_range = 1.5
    tuned = Builder()
    tuned.b(above_range)
    assert tuned._b == 1
def generate_output(self, writer):
    """Build a search index over the context's articles and write it as JSON.

    Each article is turned into a node with `self.create_node`. The node's
    id, title and text are indexed; its url, title and summary are stored
    next to the serialized index, keyed by id. The result is written to
    `search_index.json` under `self.output_path`.

    Args:
        writer: Not used by this method.
    """
    nodes = [self.create_node(article) for article in self.context['articles']]
    index_path = os.path.join(self.output_path, 'search_index.json')

    # Keys fed to the index.
    indexed_keys = ('id', 'title', 'text')
    documents = [{key: node[key] for key in indexed_keys} for node in nodes]

    # Keys stored per document id alongside the index.
    display_keys = ('url', 'title', 'summary')
    additional_data = {
        node['id']: {key: node[key] for key in display_keys}
        for node in nodes
    }

    Pipeline.register_function(special_chars_remover, 'specialCharsRemover')

    bldr = Builder()
    bldr.pipeline.add(
        trimmer, stop_word_filter, stemmer, special_chars_remover)
    bldr.search_pipeline.add(stemmer)
    bldr.ref('id')
    bldr.field('title', 10)  # second positional argument is the boost
    bldr.field('text')
    for document in documents:
        bldr.add(document)

    serialized_index = bldr.build().serialize()
    payload = {'index': serialized_index, 'data': additional_data}
    with open(index_path, 'w') as idxfile:
        json.dump(payload, idxfile)
def test_b_within_range(self):
    """Setting `b` to 0.5 stores 0.5 unchanged."""
    in_range = 0.5
    tuned = Builder()
    tuned.b(in_range)
    assert tuned._b == 0.5
def test_b_default_value(self):
    """A new Builder has `_b` equal to 0.75."""
    expected_b = 0.75
    untouched = Builder()
    assert untouched._b == expected_b