def _search(self, language, fallback=True): main, sub = splitLanguage(language) if main not in self._index: return None if fallback: # Search in sorted order, specific sub tag first, None second subs = list(self._index[main].keys()) subs.sort() if sub in subs: subs.remove(sub) subs.insert(0, sub) else: subs = [sub] result = OOSet() for sublanguage in subs: result = oo_union(result, self._index[main][sublanguage]) return IISet(entry.docid for entry in result)
def languageindex_search(self, language, fallback=True, res=None): main, sub = splitLanguage(language) if main not in self._index: return None if fallback: # Search in sorted order, specific sub tag first, None second subs = list(self._index[main].keys()) subs.sort() if sub in subs: subs.remove(sub) subs.insert(0, sub) else: subs = [sub] if not fallback and res is not None: # We do not support any optimization when fallback is enabled. # # TODO: The core loop is not in C here. Casual benchmarks suggest this # is still more effecient than trying to move it to C. The problem is # that we only have an IISet of docids as an input. We need to filter # this per language. The available index structures we have are: # # IndexEntry objects used as entries. Complex objects storing docid, # main and sub languages and UID of the canonical. Their hash and # compare function uses the canonical UID. # # self._index # An OOBTreeSet structure per language. In the outermost nodes we have # OOBTree's per language. Useful to get all items in a language. # Otherwise useless, as we would have to compare the docid attribute # of the object in the tree against our wanted set, requiring a full # loop over all items. # # self._unindex # An IOBTree of docid to entry. Better to match our docid wanted set, # but we would still have to compare the language code to the entry # object itself. # # self._sortindex # An IOBTree of docid to language tag. Looks like the best candidate # for us, as we can compare the language directly as a simple string # comparision. # # One thing to keep in mind, is that once we get a wanted set, this # will usually have gone through a path query already. This means # we will almost always already have matching set and won't filter # out any item at all. So the edge-case of a 100% match is actually # the most common one for us. # # Casual benchmarks show that trying to construct an IOBTree from the # wanted set and intersecting it with the sortindex is still slower # than having the core loop in Python code. tag = lang_tag(main, sub) result = IISet() for r in res: lang = self._sortindex.get(r) if lang == tag: result.insert(r) return result result = OOSet() for sublanguage in subs: result = oo_union(result, self._index[main][sublanguage]) return IISet(entry.docid for entry in result)