def test_register_function_warns_when_adding_function_with_same_label(self):
    Pipeline.register_function(self.fn, "fn")
    with patch("lunr.pipeline.log") as mock_log:
        Pipeline.register_function(self.fn, "fn")
        mock_log.warning.assert_called_once()
def register_languages():
    """Register all supported languages to ensure compatibility."""
    for language in set(SUPPORTED_LANGUAGES) - {"en"}:
        language_stemmer = partial(nltk_stemmer, get_language_stemmer(language))
        Pipeline.register_function(language_stemmer, "stemmer-{}".format(language))
def test_load_with_registered_functions(self):
    serialized_pipeline = ["fn"]
    Pipeline.register_function(fn, "fn")

    pipeline = Pipeline.load(serialized_pipeline)

    assert len(pipeline) == 1
    assert pipeline._stack[0] == fn
def __init__(self):
    self._ref = "id"
    self._fields = {}
    self.inverted_index = {}
    self.field_term_frequencies = {}
    self.field_lengths = {}
    self.pipeline = Pipeline()
    self.search_pipeline = Pipeline()
    self._documents = {}
    self.document_count = 0
    self._b = 0.75
    self._k1 = 1.2
    self.term_index = 0
    self.metadata_whitelist = []
def generate_stop_word_filter(stop_words, language=None):
    """Builds a stopWordFilter function from the provided list of stop words.

    The built-in `stop_word_filter` is built using this factory and it can be
    used to generate custom `stop_word_filter` functions for applications or
    non-English languages.
    """

    def stop_word_filter(token, i=None, tokens=None):
        if token and str(token) not in stop_words:
            return token

    # camelCased for compatibility with lunr.js
    label = (
        "stopWordFilter-{}".format(language)
        if language is not None
        else "stopWordFilter"
    )
    Pipeline.register_function(stop_word_filter, label)
    return stop_word_filter
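# A minimal usage sketch of the factory above: build a custom filter from an
# illustrative word list and add it to a builder pipeline. The word list and
# language code are made up, and the module paths are assumed to match the
# layout implied by the other snippets in this file.
from lunr.builder import Builder
from lunr.stop_word_filter import generate_stop_word_filter

es_filter = generate_stop_word_filter(["de", "la", "que"], language="es")

builder = Builder()
builder.pipeline.add(es_filter)  # tokens in the list are now dropped at index time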
def load(cls, serialized_index):
    """Load a serialized index"""
    from lunr import __TARGET_JS_VERSION__

    if isinstance(serialized_index, str):
        serialized_index = json.loads(serialized_index)

    if serialized_index["version"] != __TARGET_JS_VERSION__:
        logger.warning(
            "Version mismatch when loading serialized index. "
            "Current version of lunr {} does not match that of serialized "
            "index {}".format(__TARGET_JS_VERSION__, serialized_index["version"])
        )

    field_vectors = {
        ref: Vector(elements)
        for ref, elements in serialized_index["fieldVectors"]
    }

    tokenset_builder = TokenSetBuilder()
    inverted_index = {}
    for term, posting in serialized_index["invertedIndex"]:
        tokenset_builder.insert(term)
        inverted_index[term] = posting

    tokenset_builder.finish()

    return Index(
        fields=serialized_index["fields"],
        field_vectors=field_vectors,
        inverted_index=inverted_index,
        token_set=tokenset_builder.root,
        pipeline=Pipeline.load(serialized_index["pipeline"]),
    )
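# Hedged round-trip sketch for the load() classmethod above: build a tiny
# index with the top-level lunr() helper, serialize it, and load it back.
# The document and field names are illustrative only; load() accepts either
# the serialized dict or its JSON string form, as the isinstance check shows.
import json

from lunr import lunr
from lunr.index import Index

idx = lunr(ref="id", fields=("title",), documents=[{"id": "1", "title": "Green eggs"}])
serialized = idx.serialize()                    # JSON-serializable dict
restored = Index.load(json.dumps(serialized))   # or Index.load(serialized)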
def test_add_token_metadata():
    builder = get_default_builder()

    def token_length(token, i, tokens):
        token.metadata["token_length"] = len(str(token))
        return token

    Pipeline.register_function(token_length)
    builder.pipeline.add(token_length)
    builder.metadata_whitelist.append("token_length")

    idx = lunr("id", ("title", "body"), documents, builder=builder)

    [result, _, _] = idx.search("green")
    assert result["match_data"].metadata["green"]["title"]["token_length"] == [5]
    assert result["match_data"].metadata["green"]["body"]["token_length"] == [5, 5]
def get_nltk_builder(languages):
    """Returns a builder with stemmers for all languages added to it.

    Args:
        languages (list): A list of supported languages.
    """
    # all_stemmers = []
    all_stopwords_filters = []
    all_word_characters = set()

    for language in languages:
        if language == "en":
            # use Lunr's defaults
            # all_stemmers.append(lunr.stemmer.stemmer)
            all_stopwords_filters.append(stop_word_filter)
            all_word_characters.update({r"\w"})
        else:
            stopwords, word_characters = _get_stopwords_and_word_characters(language)
            # all_stemmers.append(
            #     Pipeline.registered_functions["stemmer-{}".format(language)]
            # )
            all_stopwords_filters.append(
                generate_stop_word_filter(stopwords, language=language)
            )
            all_word_characters.update(word_characters)

    builder = Builder()
    multi_trimmer = generate_trimmer("".join(sorted(all_word_characters)))
    Pipeline.register_function(
        multi_trimmer, "lunr-multi-trimmer-{}".format("-".join(languages))
    )
    builder.pipeline.reset()

    for fn in chain([multi_trimmer], all_stopwords_filters):  # , all_stemmers
        builder.pipeline.add(fn)

    # for fn in all_stemmers:
    #     builder.search_pipeline.add(fn)

    return builder
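# Illustrative multi-language indexing via the top-level lunr() helper, which
# routes through get_nltk_builder when languages are given. This assumes the
# optional NLTK extras are installed and "es" is in SUPPORTED_LANGUAGES; the
# documents are made up.
from lunr import lunr

idx = lunr(
    ref="id",
    fields=("title", "body"),
    documents=[{"id": "1", "title": "Hola", "body": "Buenos días a todos"}],
    languages=["en", "es"],
)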
def generate_output(self, writer):
    pages = [self.create_node(x) for x in self.context['articles']]
    path = os.path.join(self.output_path, 'search_index.json')

    pages_to_index = [{
        'id': x['id'],
        'title': x['title'],
        'text': x['text'],
    } for x in pages]

    additional_data = {
        x['id']: {
            'url': x['url'],
            'title': x['title'],
            'summary': x['summary'],
        }
        for x in pages
    }

    Pipeline.register_function(special_chars_remover, 'specialCharsRemover')

    bldr = Builder()
    bldr.pipeline.add(trimmer, stop_word_filter, stemmer, special_chars_remover)
    bldr.search_pipeline.add(stemmer)
    bldr.ref('id')
    bldr.field('title', 10)
    bldr.field('text')

    for page in pages_to_index:
        bldr.add(page)

    idx = bldr.build().serialize()

    with open(path, 'w') as idxfile:
        json.dump({
            'index': idx,
            'data': additional_data,
        }, idxfile)
def test_many_token_to_token_array(self, many_tokens, benchmark):
    token_to_token_array_pipeline = Pipeline()
    token_to_token_array_pipeline.add(self.token_to_token_array)
    benchmark(token_to_token_array_pipeline.run, many_tokens)
def test_few_token_to_token(self, few_tokens, benchmark):
    token_to_token_pipeline = Pipeline()
    token_to_token_pipeline.add(self.token_to_token)
    benchmark(token_to_token_pipeline.run, few_tokens)
        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k + 1]


porter_stemmer = PorterStemmer()


def stemmer(token, i=None, tokens=None):
    """Wrapper around the PorterStemmer for inclusion in pipeline.

    Args:
        token (lunr.Token): The token to stem.
        i (int): The index of the token in a set.
        tokens (list): A list of tokens representing the set.
    """
    return token.update(porter_stemmer.stem)


Pipeline.register_function(stemmer, "stemmer")
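# Quick sketch of the registered stemmer applied to a single token, exactly as
# the pipeline would call it. Token is assumed to come from lunr.token as in
# the other snippets; the stemmed form in the comment is indicative only.
from lunr.stemmer import stemmer
from lunr.token import Token

stemmed = stemmer(Token("consignment"))
print(str(stemmed))  # a Porter-stemmed form such as "consign"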
import re

from lunr.pipeline import Pipeline

full_re = re.compile(r"^\W*?([^\W]+)\W*?$")


def trimmer(token, i=None, tokens=None):
    def trim(s, metadata=None):
        match = full_re.match(s)
        if match is None:
            return s
        return match.group(1)

    return token.update(trim)


Pipeline.register_function(trimmer, "trimmer")
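# Sketch of the trimmer's effect, based on the regex above: leading and
# trailing non-word characters are stripped while the token's inner characters
# are left alone. The Token import is assumed as in the previous snippets.
from lunr.token import Token
from lunr.trimmer import trimmer

print(str(trimmer(Token('"hello!"'))))  # -> hello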
class Builder:
    """Performs indexing on a set of documents and returns instances of
    lunr.Index ready for querying.

    All configuration of the index is done via the builder, the fields to
    index, the document reference, the text processing pipeline and document
    scoring parameters are all set on the builder before indexing.
    """

    def __init__(self):
        self._ref = "id"
        self._fields = {}
        self.inverted_index = {}
        self.field_term_frequencies = {}
        self.field_lengths = {}
        self.pipeline = Pipeline()
        self.search_pipeline = Pipeline()
        self._documents = {}
        self.document_count = 0
        self._b = 0.75
        self._k1 = 1.2
        self.term_index = 0
        self.metadata_whitelist = []

    def ref(self, ref):
        """Sets the document field used as the document reference.

        Every document must have this field. The type of this field in the
        document should be a string, if it is not a string it will be coerced
        into a string by calling `str`.

        The default ref is 'id'. The ref should _not_ be changed during
        indexing, it should be set before any documents are added to the
        index. Changing it during indexing can lead to inconsistent results.
        """
        self._ref = ref

    def field(self, field_name, boost=1, extractor=None):
        """Adds a field to the list of document fields that will be indexed.

        Every document being indexed should have this field. None values for
        this field in indexed documents will not cause errors but will limit
        the chance of that document being retrieved by searches.

        All fields should be added before adding documents to the index.
        Adding fields after a document has been indexed will have no effect
        on already indexed documents.

        Fields can be boosted at build time. This allows terms within that
        field to have more importance on search results. Use a field boost to
        specify that matches within one field are more important than matches
        in other fields.

        Args:
            field_name (str): Name of the field to be added, must not include
                a forward slash '/'.
            boost (int): Optional boost factor to apply to field.
            extractor (callable): Optional function to extract a field from
                the document.

        Raises:
            ValueError: If the field name contains a `/`.
        """
        if "/" in field_name:
            raise ValueError(
                "Field {} contains illegal character `/`".format(field_name))

        self._fields[field_name] = Field(field_name, boost, extractor)

    def b(self, number):
        """A parameter to tune the amount of field length normalisation that
        is applied when calculating relevance scores.

        A value of 0 will completely disable any normalisation and a value of
        1 will fully normalise field lengths. The default is 0.75. Values of b
        will be clamped to the range 0 - 1.
        """
        if number < 0:
            self._b = 0
        elif number > 1:
            self._b = 1
        else:
            self._b = number

    def k1(self, number):
        """A parameter that controls the speed at which a rise in term
        frequency results in term frequency saturation.

        The default value is 1.2. Setting this to a higher value will give
        slower saturation levels, a lower value will result in quicker
        saturation.
        """
        self._k1 = number

    def add(self, doc, attributes=None):
        """Adds a document to the index.

        Before adding documents to the index it should have been fully setup,
        with the document ref and all fields to index already having been
        specified.

        The document must have a field name as specified by the ref (by
        default this is 'id') and it should have all fields defined for
        indexing, though None values will not cause errors.

        Args:
            doc (dict): The document to be added to the index.
            attributes (dict, optional): A set of attributes corresponding to
                the document, currently a single `boost` -> int will be taken
                into account.
""" doc_ref = str(doc[self._ref]) self._documents[doc_ref] = attributes or {} self.document_count += 1 for field_name, field in self._fields.items(): extractor = field.extractor field_value = doc[field_name] if extractor is None else extractor( doc) tokens = Tokenizer(field_value) terms = self.pipeline.run(tokens) field_ref = FieldRef(doc_ref, field_name) field_terms = defaultdict(int) # TODO: field_refs are casted to strings in JS, should we allow # FieldRef as keys? self.field_term_frequencies[str(field_ref)] = field_terms self.field_lengths[str(field_ref)] = len(terms) for term in terms: # TODO: term is a Token, should we allow Tokens as keys? term_key = str(term) field_terms[term_key] += 1 if term_key not in self.inverted_index: posting = {_field_name: {} for _field_name in self._fields} posting["_index"] = self.term_index self.term_index += 1 self.inverted_index[term_key] = posting if doc_ref not in self.inverted_index[term_key][field_name]: self.inverted_index[term_key][field_name][ doc_ref] = defaultdict(list) for metadata_key in self.metadata_whitelist: metadata = term.metadata[metadata_key] self.inverted_index[term_key][field_name][doc_ref][ metadata_key].append(metadata) def build(self): """Builds the index, creating an instance of `lunr.Index`. This completes the indexing process and should only be called once all documents have been added to the index. """ self._calculate_average_field_lengths() self._create_field_vectors() self._create_token_set() return Index( inverted_index=self.inverted_index, field_vectors=self.field_vectors, token_set=self.token_set, fields=list(self._fields.keys()), pipeline=self.search_pipeline, ) def _create_token_set(self): """Creates a token set of all tokens in the index using `lunr.TokenSet` """ self.token_set = TokenSet.from_list( sorted(list(self.inverted_index.keys()))) def _calculate_average_field_lengths(self): """Calculates the average document length for this index""" accumulator = defaultdict(int) documents_with_field = defaultdict(int) for field_ref, length in self.field_lengths.items(): _field_ref = FieldRef.from_string(field_ref) field = _field_ref.field_name documents_with_field[field] += 1 accumulator[field] += length for field_name in self._fields: accumulator[field_name] /= documents_with_field[field_name] self.average_field_length = accumulator def _create_field_vectors(self): """Builds a vector space model of every document using lunr.Vector.""" field_vectors = {} term_idf_cache = {} for field_ref, term_frequencies in self.field_term_frequencies.items(): _field_ref = FieldRef.from_string(field_ref) field_name = _field_ref.field_name field_length = self.field_lengths[field_ref] field_vector = Vector() field_boost = self._fields[field_name].boost doc_boost = self._documents[_field_ref.doc_ref].get("boost", 1) for term, tf in term_frequencies.items(): term_index = self.inverted_index[term]["_index"] if term not in term_idf_cache: idf = Idf(self.inverted_index[term], self.document_count) term_idf_cache[term] = idf else: idf = term_idf_cache[term] score = ( idf * ((self._k1 + 1) * tf) / (self._k1 * (1 - self._b + self._b * (field_length / self.average_field_length[field_name])) + tf)) score *= field_boost score *= doc_boost score_with_precision = round(score, 3) field_vector.insert(term_index, score_with_precision) field_vectors[field_ref] = field_vector self.field_vectors = field_vectors def use(self, fn, *args, **kwargs): """Applies a plugin to the index builder. 
        A plugin is a function that is called with the index builder as its
        context. Plugins can be used to customise or extend the behaviour of
        the index in some way.

        A plugin is just a function that encapsulates the custom behaviour
        that should be applied when building the index.

        The plugin function will be called with the index builder as its
        argument; additional arguments can also be passed when calling use.
        """
        fn(self, *args, **kwargs)
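# Minimal plugin sketch following the use() contract above: the plugin
# receives the builder plus any extra arguments passed to use(). The plugin
# name, field, and boost value are hypothetical.
from lunr.builder import Builder


def boost_title_plugin(builder, boost):
    # hypothetical plugin: registers a boosted "title" field on the builder
    builder.field("title", boost=boost)


builder = Builder()
builder.use(boost_title_plugin, boost=10)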
def test_load_with_unregistered_functions(self):
    serialized_pipeline = ["fn"]
    with pytest.raises(BaseLunrException):
        Pipeline.load(serialized_pipeline)
def test_register_function_adds_defaults_to_name_of_the_function(self):
    Pipeline.register_function(self.fn)
    assert self.fn.label == self.fn.__name__
def test_register_function_adds_function_to_list_of_registered_functions(self):
    Pipeline.register_function(self.fn, "fn")
    assert Pipeline.registered_functions["fn"] == self.fn
def test_register_function_adds_a_label_property_to_the_function(self):
    Pipeline.register_function(self.fn, "fn")
    assert self.fn.label == "fn"
def test_serialize_returns_array_of_registered_function_labels(self):
    Pipeline.register_function(fn, "fn")
    self.pipeline.add(fn)
    assert self.pipeline.serialize() == ["fn"]
    assert repr(self.pipeline) == '<Pipeline stack="fn">'
def setup_mock_pipline(self, monkeypatch):
    monkeypatch.setattr(Pipeline, "registered_functions", {})
    monkeypatch.setattr(Pipeline, "warn_if_function_not_registered", noop)
    self.pipeline = Pipeline()