import pytest

from lunr.builder import Builder
from lunr.index import Index
from lunr.token_set import TokenSet
from lunr.vector import Vector


class TestBuilderBuild:
    def setup_method(self, method):
        self.builder = Builder()
        doc = {"id": "id", "title": "test", "body": "missing"}

        self.builder.ref("id")
        self.builder.field("title")
        self.builder.add(doc)
        self.index = self.builder.build()

    def test_adds_tokens_to_inverted_index(self):
        _assert_deep_keys(self.builder.inverted_index, "test.title.id")

    def test_builds_vector_space_of_the_document_fields(self):
        assert "title/id" in self.builder.field_vectors
        assert isinstance(self.builder.field_vectors["title/id"], Vector)

    def test_skips_fields_not_defined_for_indexing(self):
        assert "missing" not in self.builder.inverted_index

    def test_builds_a_token_set_for_the_corpus(self):
        needle = TokenSet.from_string("test")
        assert "test" in self.builder.token_set.intersect(needle).to_list()

    def test_calculates_average_field_length(self):
        assert self.builder.average_field_length["title"] == 1

    def test_index_is_returned(self):
        assert isinstance(self.index, Index)
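# The tests above and below call an _assert_deep_keys helper that is not part
# of this excerpt. A minimal sketch, assuming it checks that a dot-separated
# key path exists in nested dictionaries:
def _assert_deep_keys(dictionary, key_path):
    node = dictionary
    for key in key_path.split("."):
        assert key in node, "{} not in {}".format(key, list(node))
        node = node[key]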
def test_define_fields_to_index():
    builder = Builder()
    builder.field("foo")

    assert len(builder._fields) == 1
    assert builder._fields["foo"].name == "foo"
    assert builder._fields["foo"].boost == 1
    assert builder._fields["foo"].extractor is None
    assert repr(builder._fields["foo"]) == '<Field "foo" boost="1">'
    assert hash(builder._fields["foo"]) == hash("foo")
class TestBuilderAdd:
    def test_builder_casts_docrefs_to_strings(self):
        self.builder = Builder()
        self.builder.ref("id")
        self.builder.field("title")

        self.builder.add(dict(id=123, title="test", body="missing"))

        _assert_deep_keys(self.builder.inverted_index, "test.title.123")

    def test_builder_metadata_whitelist_includes_metadata_in_index(self):
        self.builder = Builder()
        self.builder.ref("id")
        self.builder.field("title")
        self.builder.metadata_whitelist = ["position"]

        self.builder.add(dict(id="a", title="test", body="missing"))
        self.builder.add(dict(id="b", title="another test", body="missing"))

        assert self.builder.inverted_index["test"]["title"]["a"] == {
            "position": [[0, 4]]
        }
        assert self.builder.inverted_index["test"]["title"]["b"] == {
            "position": [[8, 4]]
        }

    def test_builder_field_raises_if_contains_slash(self):
        self.builder = Builder()
        with pytest.raises(ValueError):
            self.builder.field("foo/bar")

    def test_builder_extracts_nested_properties_from_document(self):
        self.builder = Builder()
        self.builder.field("name", extractor=lambda d: d["person"]["name"])

        self.builder.add({"id": "id", "person": {"name": "bob"}})

        assert self.builder.inverted_index["bob"]["name"]["id"] == {}

    def test_builder_field_term_frequency_and_length(self):
        self.builder = Builder()
        self.builder.ref("id")
        self.builder.field("title")

        self.builder.add(
            dict(id="a", title="test a testing test", body="missing"))

        assert self.builder.field_term_frequencies == {
            "title/a": {"test": 2, "a": 1, "testing": 1}
        }
        assert self.builder.field_lengths == {"title/a": 4}
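# For context: the whitelisted "position" metadata tested above surfaces at
# query time through each result's match data, which maps
# term -> field -> {"position": [[start, length]]}. A sketch, assuming the
# result shape shown in lunr.py's README ('ref', 'score', 'match_data'):
builder = Builder()
builder.ref("id")
builder.field("title")
builder.metadata_whitelist = ["position"]
builder.add({"id": "a", "title": "test"})
idx = builder.build()

for result in idx.search("test"):
    positions = result["match_data"].metadata["test"]["title"]["position"]
    print(result["ref"], positions)  # a [[0, 4]]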
from lunr.builder import Builder
from lunr.stemmer import stemmer
from lunr.stop_word_filter import stop_word_filter
from lunr.trimmer import trimmer


def lunr_builder(ref, fields):
    """A convenience function to configure a lunr.Builder.

    Args:
        ref (str): The key in the documents to be used as the reference.
        fields (list): A list of strings defining fields in the documents
            to index.

    Returns:
        Builder: The configured Builder, ready to add documents and build.
    """
    builder = Builder()
    builder.pipeline.add(trimmer, stop_word_filter, stemmer)
    builder.search_pipeline.add(stemmer)
    builder.ref(ref)
    for field in fields:
        builder.field(field)
    return builder
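# A usage sketch for lunr_builder: feed documents to the returned builder,
# then build the index and search it.
builder = lunr_builder("id", ("title", "body"))
builder.add({"id": "1", "title": "Search", "body": "Full text search for Python"})
index = builder.build()
index.search("search")  # -> [{'ref': '1', 'score': ..., 'match_data': ...}]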
from lunr import languages as lang


def lunr(ref, fields, documents, languages=None):
    """A convenience function to configure and construct a lunr.Index.

    Args:
        ref (str): The key in the documents to be used as the reference.
        fields (list): A list of strings defining fields in the documents to
            index. Optionally a list of dictionaries with three keys:
            `field_name` defining the document's field, `boost` an integer
            defining a boost to be applied to the field, and `extractor`
            a callable taking the document as a single argument and returning
            a string located in the document in a particular way.
        documents (list): The list of dictionaries representing the documents
            to index. Optionally a 2-tuple of dicts, the first one being
            the document and the second the attributes associated with it.
        languages (str or list, optional): The languages to use if using
            NLTK language support, ignored if NLTK is not available.

    Returns:
        Index: The populated Index ready to search against.
    """
    if languages is not None and lang.LANGUAGE_SUPPORT:
        if isinstance(languages, str):
            languages = [languages]
        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            raise RuntimeError(
                "The specified languages {} are not supported, "
                "please choose one of {}".format(
                    ", ".join(unsupported_languages),
                    ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
                ))
        builder = lang.get_nltk_builder(languages)
    else:
        builder = Builder()
        builder.pipeline.add(trimmer, stop_word_filter, stemmer)
        builder.search_pipeline.add(stemmer)

    builder.ref(ref)
    for field in fields:
        if isinstance(field, dict):
            builder.field(**field)
        else:
            builder.field(field)

    for document in documents:
        if isinstance(document, (tuple, list)):
            builder.add(document[0], attributes=document[1])
        else:
            builder.add(document)

    return builder.build()
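# Typical usage of the lunr convenience function with the default English
# pipeline configured above.
documents = [
    {"id": "a", "title": "Mr. Green kills Colonel Mustard", "body": "in the study"},
    {"id": "b", "title": "Plumb waters plant", "body": "in the kitchen"},
]
idx = lunr(ref="id", fields=("title", "body"), documents=documents)
idx.search("plant")  # -> [{'ref': 'b', 'score': ..., 'match_data': ...}]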
def generate_output(self, writer):
    pages = [self.create_node(x) for x in self.context['articles']]
    path = os.path.join(self.output_path, 'search_index.json')

    pages_to_index = [{
        'id': x['id'],
        'title': x['title'],
        'text': x['text'],
    } for x in pages]
    additional_data = {
        x['id']: {
            'url': x['url'],
            'title': x['title'],
            'summary': x['summary'],
        } for x in pages
    }

    # Register the custom pipeline function so the serialized index can
    # reference it by name.
    Pipeline.register_function(special_chars_remover, 'specialCharsRemover')

    bldr = Builder()
    bldr.pipeline.add(trimmer, stop_word_filter, stemmer,
                      special_chars_remover)
    bldr.search_pipeline.add(stemmer)
    bldr.ref('id')
    bldr.field('title', 10)  # boost matches in the title field
    bldr.field('text')

    for page in pages_to_index:
        bldr.add(page)

    idx = bldr.build().serialize()
    with open(path, 'w') as idxfile:
        json.dump({
            'index': idx,
            'data': additional_data,
        }, idxfile)
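# The serialized index written above can be restored later without
# re-indexing. A sketch, assuming lunr.py's Index.load; the custom pipeline
# function must be registered under the same name before loading, and the
# output path shown here is hypothetical.
import json

from lunr.index import Index
from lunr.pipeline import Pipeline

Pipeline.register_function(special_chars_remover, 'specialCharsRemover')

with open('output/search_index.json') as f:
    data = json.load(f)

idx = Index.load(data['index'])
results = idx.search('pelican')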