def test_requires():
    """Verify Query.requires() reports the terms every match must contain."""
    a = Term("f", u("a"))
    b = Term("f", u("b"))
    # And requires every sub-query; Or requires none (any one suffices).
    assert_equal(And([a, b]).requires(), {a, b})
    assert_equal(Or([a, b]).requires(), set())
    # AndMaybe only *requires* its mandatory (first) sub-query.
    assert_equal(AndMaybe(a, b).requires(), {a})
    assert_equal(a.requires(), {a})
def test_span_term():
    """A single-term query must yield one span per occurrence, positioned at
    the word's index in the stored text, and match exactly the documents
    containing the word."""
    ix = get_index()
    with ix.searcher() as searcher:
        stored_lists = [doc["text"] for doc in searcher.all_stored_fields()]
        for word in domain:
            matcher = Term("text", word).matcher(searcher)
            matched_ids = set()
            while matcher.is_active():
                docnum = matcher.id()
                word_spans = matcher.spans()
                matched_ids.add(docnum)
                text = list(searcher.stored_fields(docnum)["text"])
                assert word in text
                if word != "bravo":
                    # Every word except "bravo" occurs exactly once per doc,
                    # so a one-position span at the word's index is expected.
                    assert len(word_spans) == 1
                    assert text.index(word) == word_spans[0].start
                    assert text.index(word) == word_spans[0].end
                matcher.next()
            # Cross-check matched doc ids against the stored word lists.
            for docnum, words in enumerate(stored_lists):
                if word in words:
                    assert docnum in matched_ids
                else:
                    assert docnum not in matched_ids
def test_requires():
    """Check Query.requires() — the set of terms every match must contain."""
    a = Term("f", u("a"))
    b = Term("f", u("b"))
    assert And([a, b]).requires() == {a, b}  # conjunction requires all terms
    assert Or([a, b]).requires() == set()    # disjunction requires none
    assert AndMaybe(a, b).requires() == {a}  # only the mandatory part counts
    assert a.requires() == {a}
def test_excludematcher():
    """Spans must stay aligned with term positions after documents are
    deleted (the deletion-filtering wrapper must not shift span offsets)."""
    schema = fields.Schema(content=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    domain = ("alfa", "bravo", "charlie", "delta")
    # Index every permutation of the domain words, three segments' worth.
    for _ in xrange(3):
        writer = ix.writer()
        for words in permutations(domain):
            writer.add_document(content=u(" ").join(words))
        writer.commit(merge=False)
    # Delete a few documents so the matcher has to skip excluded ids.
    writer = ix.writer()
    writer.delete_document(5)
    writer.delete_document(10)
    writer.delete_document(28)
    writer.commit(merge=False)
    query = Term("content", "bravo")
    with ix.searcher() as searcher:
        matcher = query.matcher(searcher)
        while matcher.is_active():
            content = searcher.stored_fields(matcher.id())["content"].split()
            for span in matcher.spans():
                assert content[span.start] == "bravo"
            matcher.next()
def validate_name(meta, itemid):
    """
    Check whether the names are valid.

    Will just return, if they are valid, will raise a NameNotValidError if not.
    Each violation also flashes an error message so it appears at the top of
    the form before the exception aborts the save.

    :param meta: item metadata dict; NAME and NAMESPACE keys are consulted
    :param itemid: itemid of the item being saved, or None for a new item
    :raises NameNotValidError: on duplicate names, names starting with a
        reserved character, names colliding with a namespace, or names
        already taken by another item in the same namespace
    """
    names = meta.get(NAME)
    current_namespace = meta.get(NAMESPACE)
    if current_namespace is None:
        raise NameNotValidError(L_("No namespace field in the meta."))
    # Configured namespaces, without trailing slashes, for collision checks.
    namespaces = [namespace.rstrip('/') for namespace, _ in app.cfg.namespace_mapping]
    if len(names) != len(set(names)):
        msg = L_("The names in the name list must be unique.")
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)
    # Item names must not start with '@' or '+', '@something' denotes a field where as '+something' denotes a view.
    invalid_names = [name for name in names if name.startswith(('@', '+'))]
    if invalid_names:
        msg = L_(
            "Item names (%(invalid_names)s) must not start with '@' or '+'",
            invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)
    namespaces = namespaces + NAMESPACES_IDENTIFIER  # Also dont allow item names to match with identifier namespaces.
    # Item names must not match with existing namespaces.
    # Only the first path segment of each name is compared.
    invalid_names = [name for name in names if name.split('/', 1)[0] in namespaces]
    if invalid_names:
        msg = L_(
            "Item names (%(invalid_names)s) must not match with existing namespaces.",
            invalid_names=", ".join(invalid_names))
        flash(msg, "error")  # duplicate message at top of form
        raise NameNotValidError(msg)
    # Match any of the proposed names within the current namespace.
    query = And([
        Or([Term(NAME, name) for name in names]),
        Term(NAMESPACE, current_namespace)
    ])
    # There should be not item existing with the same name.
    if itemid is not None:
        query = And([query, Not(Term(ITEMID, itemid))
                     ])  # search for items except the current item.
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        results = searcher.search(query)
        duplicate_names = {
            name
            for result in results for name in result[NAME] if name in names
        }
        if duplicate_names:
            msg = L_("Item(s) named %(duplicate_names)s already exist.",
                     duplicate_names=", ".join(duplicate_names))
            flash(msg, "error")  # duplicate message at top of form
            raise NameNotValidError(msg)
def get_index(self, startswith=None, selected_groups=None):
    """Build and return the flat item index for this view.

    :param startswith: optional initial letter(s) to filter item names
    :param selected_groups: optional content groups to restrict the index to
    """
    item_fqname = self.fqname
    # An empty value or NAMESPACE_ALL means the global, all-namespaces index.
    is_global = not item_fqname.value or item_fqname.value == NAMESPACE_ALL
    base = Term(WIKINAME, app.cfg.interwikiname)
    query = base & self.build_index_query(startswith, selected_groups, is_global)
    if item_fqname.value != NAMESPACE_ALL and not item_fqname.value.startswith(NAMESPACE_ALL + '/'):
        # Not a global view: restrict to this item's namespace.
        query = Term(NAMESPACE, item_fqname.namespace) & query
    hits = flaskg.storage.search(query, sortedby=NAME_EXACT, limit=None)
    return self.make_flat_index(hits, is_global)
def _trashed(namespace):
    """Return the trashed (deleted) items in ``namespace`` as namedtuples
    carrying the metadata the trash view displays; NAMESPACE_ALL widens the
    search to every namespace."""
    query = And([Term(WIKINAME, app.cfg.interwikiname), Term(TRASH, True)])
    if namespace != NAMESPACE_ALL:
        query = And([query, Term(NAMESPACE, namespace)])
    entry_cls = namedtuple('trashedEntry',
                           'fqname oldname revid rev_number mtime comment editor parentid')
    trashed = []
    for meta in flaskg.storage.search_meta(query, limit=None):
        item_fqname = CompositeName(meta[NAMESPACE], ITEMID, meta[ITEMID])
        trashed.append(entry_cls(item_fqname, meta[NAME_OLD], meta[REVID],
                                 meta[REV_NUMBER], meta[MTIME], meta[COMMENT],
                                 get_editor_info(meta), meta[PARENTID]))
    return trashed
def test_replace():
    """Query.replace() must substitute the term text recursively, across
    sub-query types, while leaving boosts and structure intact."""
    original = And([
        Or([Term("a", "b"), Term("b", "c")], boost=1.2),
        Variations("a", "b", boost=2.0),
    ])
    replaced = original.replace("a", "b", "BB")
    expected = And([
        Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
        Variations("a", "BB", boost=2.0),
    ])
    assert replaced == expected
def _trashed(namespace):
    """Collect trashed (deleted) items in ``namespace`` (or everywhere, for
    NAMESPACE_ALL) as namedtuples for the trash view."""
    query = And([Term(WIKINAME, app.cfg.interwikiname), Term(TRASH, True)])
    if namespace != NAMESPACE_ALL:
        query = And([query, Term(NAMESPACE, namespace)])
    entry_cls = namedtuple('trashedEntry', 'fqname oldname revid mtime comment editor')
    return [
        entry_cls(rev.fqname, rev.meta[NAME_OLD], rev.meta[REVID],
                  rev.meta[MTIME], rev.meta[COMMENT], get_editor_info(rev.meta))
        for rev in flaskg.storage.search(query, limit=None)
    ]
def test_duplicates():
    """normalize() should collapse duplicate sub-queries, but keep terms
    that differ in boost."""
    query = And([Term("a", u("b")), Term("a", u("b"))])
    assert_equal(query.normalize(), Term("a", u("b")))

    query = And([Prefix("a", u("b")), Prefix("a", u("b"))])
    assert_equal(query.normalize(), Prefix("a", u("b")))

    query = And([Variations("a", u("b")),
                 And([Variations("a", u("b")), Term("a", u("b"))])])
    assert_equal(query.normalize(),
                 And([Variations("a", u("b")), Term("a", u("b"))]))

    # A differing boost makes an otherwise-equal term NOT a duplicate.
    query = And([Term("a", u("b")), Prefix("a", u("b")),
                 Term("a", u("b"), boost=1.1)])
    assert_equal(query.normalize(), query)

    # Wildcard without * or ? normalizes to Term
    query = And([Wildcard("a", u("b")),
                 And([Wildcard("a", u("b")), Term("a", u("b"))])])
    assert_equal(query.normalize(), Term("a", u("b")))
def test_span_before():
    """SpanBefore must match only docs where "alfa" occurs before "charlie"."""
    ix = get_index()
    with ix.searcher() as searcher:
        query = spans.SpanBefore(Term("text", "alfa"), Term("text", "charlie"))
        matcher = query.matcher(searcher)
        while matcher.is_active():
            words = list(searcher.stored_fields(matcher.id())["text"])
            assert "alfa" in words
            assert "charlie" in words
            assert words.index("alfa") < words.index("charlie")
            matcher.next()
def test_regular_and():
    """A plain And query's spans should point at occurrences of its terms."""
    ix = get_index()
    with ix.searcher() as searcher:
        query = And([Term("text", "bravo"), Term("text", "alfa")])
        matcher = query.matcher(searcher)
        while matcher.is_active():
            words = searcher.stored_fields(matcher.id())["text"]
            for span in matcher.spans():
                # Each span must land on one of the query's two terms.
                assert words[span.start] in ("bravo", "alfa")
            matcher.next()
def get_templates(self, contenttype=None):
    """Return the names of template items (items tagged 'template'),
    optionally restricted to a specific contenttype."""
    criteria = [Term(WIKINAME, app.cfg.interwikiname), Term(TAGS, u'template')]
    if contenttype is not None:
        criteria.append(Term(CONTENTTYPE, contenttype))
    hits = flaskg.storage.search(And(criteria), sortedby=NAME_EXACT, limit=None)
    return [hit.name for hit in hits]
def search_addresses(searcher, query):
    """Search the to/cc/bcc/sender fields for addresses matching ``query``,
    masking out mails tagged drafts or trash, and return the flattened
    per-field group listings."""
    excluded = Term("tag", "drafts") | Term("tag", "trash")
    grouped = []
    for field in ['to', 'cc', 'bcc', 'sender']:
        parser = QueryParser(field, searcher.schema)
        # Substring match on the address field, grouped by address value.
        hits = searcher.search(parser.parse("*%s*" % query),
                               limit=None,
                               mask=excluded,
                               groupedby=sorting.FieldFacet(field, allow_overlap=True))
        grouped.append(hits.groups())
    return flatten(grouped)
def test_span_or():
    """SpanOr should match documents satisfying either sub-query."""
    ix = get_index()
    with ix.searcher() as searcher:
        near = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), slop=2)
        bravo = Term("text", "bravo")
        matcher = spans.SpanOr([near, bravo]).matcher(searcher)
        while matcher.is_active():
            words = searcher.stored_fields(matcher.id())["text"]
            assert ("alfa" in words and "charlie" in words) or "bravo" in words
            matcher.next()
def test_span_condition():
    """SpanCondition matches "alfa" only in docs that also contain "charlie",
    and its spans point only at the primary ("alfa") occurrences."""
    ix = get_index()
    with ix.searcher() as searcher:
        query = spans.SpanCondition(Term("text", "alfa"), Term("text", "charlie"))
        matcher = query.matcher(searcher)
        while matcher.is_active():
            words = list(searcher.stored_fields(matcher.id())["text"])
            assert "alfa" in words
            assert "charlie" in words
            for span in matcher.spans():
                assert words[span.start] == "alfa"
            matcher.next()
def remove_document(self, path, section='generic'):
    """Delete the indexed document for (path, section) and its cached
    content file, tolerating a file that was never written."""
    delete_query = And([Term('path', path), Term('section', unicode(section))])
    self._writer.delete_by_query(delete_query)
    content_path = self._index.get_content_filename(path, section)
    try:
        os.remove(content_path)
    except OSError:
        # No content file on disk; nothing to clean up.
        pass
def test_normalize_compound():
    """Deeply nested duplicate Or queries should normalize to one flat Or."""
    def leaf():
        return Or([Term("a", u("a")), Term("a", u("b"))])

    def nested(depth):
        # Triple the fan-out at every level; all leaves are identical.
        if depth == 0:
            return leaf()
        return Or([nested(depth - 1), nested(depth - 1), nested(depth - 1)])

    normalized = nested(7).normalize()
    assert_equal(normalized, Or([Term("a", u("a")), Term("a", u("b"))]))
def get_subitem_revs(self):
    """
    Create a list of subitems of this item.

    Subitems are in the form of storage Revisions.
    """
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Term(NAMESPACE, self.fqname.namespace)])
    if self.names:
        # Restrict to items under any of this item's subitem prefixes; an
        # item with an empty name acts as a "virtual root" matching all
        # wiki items, so it skips this restriction.
        prefix_query = Or([Prefix(NAME_EXACT, prefix) for prefix in self.subitem_prefixes])
        query = And([query, prefix_query])
    return flaskg.storage.search(query, sortedby=NAME_EXACT, limit=None)
def add_other_versions(searcher, results, user, staff):
    """Annotate each hit with the other visible versions of the same
    vendor/name package; staff users bypass the permission filter."""
    permission_filter = []
    if not staff:
        # Visible if public, owned by the user, or shared with a group
        # the user belongs to.
        group_terms = [Term('groups', group.name.lower()) for group in user.groups.all()]
        permission_filter = [Or([Term('public', 't'),
                                 Term('users', user.username.lower())] + group_terms)]
    for result in results:
        vendor_query = And([Term('vendor_name', '%s/%s' % (result['vendor'], result['name']))]
                           + permission_filter)
        versions = [hit.fields()['version'] for hit in searcher.search(vendor_query)]
        result['others'] = [version for version in versions if version != result['version']]
    return results
def get_subscribers(**meta):
    """
    Get all users that are subscribed to the item

    Combines two sources: exact subscriptions (by itemid, namespace:name, or
    namespace:tag) found via a single Or query, and pattern subscriptions
    matched against the lexicon of stored patterns. Only users with an email
    address who may read the item are included.

    :param meta: key/value pairs from item metadata - itemid, name, namespace, tags keys
    :return: a set of Subscriber objects
    """
    itemid = meta.get(ITEMID)
    name = meta.get(NAME)
    namespace = meta.get(NAMESPACE)
    fqname = CompositeName(namespace, ITEMID, itemid)
    tags = meta.get(TAGS)
    terms = []
    if itemid is not None:
        # Direct subscriptions keyed by itemid.
        terms.extend(
            [Term(SUBSCRIPTION_IDS, "{0}:{1}".format(ITEMID, itemid))])
    if namespace is not None:
        if name is not None:
            # Subscriptions keyed by (namespace, name) for each item name.
            terms.extend(
                Term(SUBSCRIPTION_IDS, "{0}:{1}:{2}".format(
                    NAME, namespace, name_)) for name_ in name)
        if tags is not None:
            # Subscriptions keyed by (namespace, tag) for each item tag.
            terms.extend(
                Term(SUBSCRIPTION_IDS, "{0}:{1}:{2}".format(
                    TAGS, namespace, tag)) for tag in tags)
    query = Or(terms)
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        result_iterators = [
            searcher.search(query, limit=None),
        ]
        subscription_patterns = searcher.lexicon(SUBSCRIPTION_PATTERNS)
        # looks like whoosh gives us bytes (not str), decode them:
        subscription_patterns = [
            p if isinstance(p, str) else p.decode()
            for p in subscription_patterns
        ]
        patterns = get_matched_subscription_patterns(subscription_patterns,
                                                     **meta)
        # One extra document iterator per matching pattern subscription.
        result_iterators.extend(
            searcher.documents(subscription_patterns=pattern)
            for pattern in patterns)
        subscribers = set()
        for user in chain.from_iterable(result_iterators):
            email = user.get(EMAIL)
            if email:
                # Local import — presumably avoids a circular import at
                # module load time; TODO confirm.
                from moin.user import User
                u = User(uid=user.get(ITEMID))
                # Only notify users who are allowed to read the item.
                if u.may.read(fqname):
                    locale = user.get(LOCALE, DEFAULT_LOCALE)
                    subscribers.add(
                        Subscriber(user[ITEMID], user[NAME][0], email, locale))
    return subscribers
def search():
    """Search blog posts and return a JSON page of results.

    Query parameters:
        search   -- free-text (fuzzy) query over title and post_content
        author   -- exact author filter, ANDed with the text query if any
        category -- exact category filter, ANDed likewise
        page     -- 1-based page number, defaults to 1

    Returns a JSON response with the hits for the requested page, sorted by
    date descending, 25 per page, plus the page number and total hit count.
    """
    # Removed leftover debug print() calls; behavior of the HTTP response
    # is unchanged.
    search = request.args.get('search')
    author = request.args.get('author')
    category = request.args.get('category')
    page = int(request.args.get('page')) if request.args.get('page') is not None else 1

    if search is None and author is None and category is None:
        # No criteria at all: match every document.
        myquery = Every()
    else:
        if search is not None:
            myquery = MultifieldParser(["title", "post_content"],
                                       ix.schema,
                                       plugins=[FuzzyTermPlugin()]).parse(search)
        else:
            myquery = None
        # AND in the exact-match filters (author first, then category,
        # matching the original combination order).
        if author is not None:
            myquery = Term('author', author) if myquery is None \
                else myquery & Term('author', author)
        if category is not None:
            myquery = Term('category', category) if myquery is None \
                else myquery & Term('category', category)

    with ix.searcher() as searcher:
        results = searcher.search_page(myquery,
                                       page,
                                       pagelen=25,
                                       sortedby="date",
                                       reverse=True)
        results_json = json.dumps(
            {
                "results": [dict(i) for i in results],
                "page": page,
                "total_results": results.total
            },
            default=str)
    return Response(response=results_json,
                    status=200,
                    mimetype="application/json")
def do_show(self, revid): """ Show a blog item and a list of its blog entries below it. If tag GET-parameter is defined, the list of blog entries consists only of those entries that contain the tag value in their lists of tags. """ # for now it is just one tag=value, later it could be tag=value1&tag=value2&... tag = request.values.get('tag') prefix = self.name + u'/' current_timestamp = int(time.time()) terms = [ Term(WIKINAME, app.cfg.interwikiname), # Only blog entry itemtypes Term(ITEMTYPE, ITEMTYPE_BLOG_ENTRY), # Only sub items of this item Prefix(NAME_EXACT, prefix), ] if tag: terms.append(Term(TAGS, tag)) query = And(terms) def ptime_sort_key(searcher, docnum): """ Compute the publication time key for blog entries sorting. If PTIME is not defined, we use MTIME. """ fields = searcher.stored_fields(docnum) ptime = fields.get(PTIME, fields[MTIME]) return ptime ptime_sort_facet = FunctionFacet(ptime_sort_key) revs = flaskg.storage.search(query, sortedby=ptime_sort_facet, reverse=True, limit=None) blog_entry_items = [ Item.create(rev.name, rev_id=rev.revid) for rev in revs ] return render_template( 'blog/main.html', item_name=self.name, fqname=split_fqname(self.name), blog_item=self, blog_entry_items=blog_entry_items, tag=tag, item=self, )
def has_word(self, character_set, key):
    """Return True if ``key`` is indexed for the requested character
    set(s); at least one of TRADITIONAL / SIMPLIFIED must be requested."""
    assert character_set & TRADITIONAL or character_set & SIMPLIFIED
    with self._index.searcher() as searcher:
        # Documentation for Whoosh says the 'in' operator can be used on
        # the searcher to look for the key, but it didn't work here, so
        # build an explicit query instead.
        lookup = NullQuery()
        if character_set & TRADITIONAL:
            lookup = lookup | Term("traditional", key)
        if character_set & SIMPLIFIED:
            lookup = lookup | Term('simplified', key)
        hits = searcher.search(lookup)
        return len(hits) > 0
def get_item_last_revisions(app, fqname):
    """
    Get 2 or less most recent item revisions from the index

    :param app: local proxy app
    :param fqname: the fqname of the item
    :return: a list of revisions (possibly empty)
    """
    # TODO: Implement AccessDenied or similar error in case the user does not have access to item
    # and to also to handle the case where the item has no revisions
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Term(fqname.field, fqname.value)])
    revisions = flaskg.storage.search(query,
                                      idx_name=ALL_REVS,
                                      sortedby=[MTIME],
                                      reverse=True,
                                      limit=2)
    return list(revisions)
def filter_queryset(self, request, queryset, view):
    """
    Optionally narrow ``queryset`` via query parameters.

    ``parent=`` (empty string) filters for records with a null parent.
    ``q=<whoosh query>`` runs a full-text search against the Whoosh index
    and intersects the hits' primary keys with the incoming queryset.

    :raises NotImplementedError: if the configured haystack backend is
        not Whoosh.
    """
    if ('parent' in request.query_params
            and request.query_params['parent'] == ''):
        # Empty string means query for null parent
        queryset = queryset.filter(parent=None)
    if 'q' not in request.query_params:
        return queryset
    queryset_pks = list(queryset.values_list('pk', flat=True))
    if not len(queryset_pks):
        # Nothing to search within; the intersection would be empty anyway.
        return queryset
    # 'q' means do a full-text search of the document fields, where the
    # critera are given in the Whoosh query language:
    # https://pythonhosted.org/Whoosh/querylang.html
    search_queryset = SearchQuerySet().models(queryset.model)
    search_backend = search_queryset.query.backend
    if not isinstance(search_backend, WhooshSearchBackend):
        raise NotImplementedError(
            'Only the Whoosh search engine is supported at this time')
    if not search_backend.setup_complete:
        search_backend.setup()
    searcher = search_backend.index.searcher()
    # Parse the user's query
    user_query = QueryParser('text', search_backend.index.schema).parse(
        request.query_params['q'])
    # Construct a query to restrict the search to the appropriate model
    filter_query = Term(DJANGO_CT, get_model_ct(queryset.model))
    # Does the search index for this model have a field that allows
    # filtering by permissions?
    haystack_index = haystack.connections[
        'default'].get_unified_index().get_index(queryset.model)
    if hasattr(haystack_index, 'users_granted_permission'):
        # Also restrict the search to records that the user can access
        filter_query &= Term(
            'users_granted_permission', request.user.username)
    results = searcher.search(
        user_query,
        filter=filter_query,
        scored=False,
        sortedby=None,
        limit=None
    )
    pk_type = type(queryset_pks[0])
    results_pks = {
        # Coerce each `django_id` from unicode to the appropriate type,
        # usually `int`
        pk_type((x['django_id']))
        for x in results
    }
    filter_pks = results_pks.intersection(queryset_pks)
    return queryset.filter(pk__in=filter_pks)
def item_acl_report():
    """
    Return a sorted list of all items in the wiki along with the ACL Meta-data.

    Item names are prefixed with the namespace, if there is a non-default
    namespace. If there are multiple names, the first name is used for sorting.
    """
    query = And([
        Term(WIKINAME, app.cfg.interwikiname),
        Not(Term(NAMESPACE, NAMESPACE_USERPROFILES)),
    ])
    all_metas = flaskg.storage.search_meta(query,
                                           idx_name=LATEST_REVS,
                                           sortedby=[NAMESPACE, NAME],
                                           limit=None)
    items_acls = []
    for meta in all_metas:
        item_namespace = meta.get(NAMESPACE)
        item_id = meta.get(ITEMID)
        if item_namespace:
            # Prefix each name with its (non-default) namespace for display.
            item_name = [
                item_namespace + '/' + name for name in meta.get(NAME)
            ]
        else:
            item_name = meta.get(NAME)
        item_acl = meta.get(ACL)
        acl_default = item_acl is None
        if acl_default:
            # No explicit ACL on the item: fall back to the configured
            # default for the item's namespace.
            for namespace, acl_config in app.cfg.acl_mapping:
                if item_namespace == namespace:
                    item_acl = acl_config['default']
        fqnames = gen_fqnames(meta)
        # Removed an unused local (`fqname = fqnames[0]` was assigned but
        # never read; the dict below uses fqnames[0] directly).
        items_acls.append({
            'name': item_name,
            'name_old': meta.get('name_old', []),
            'itemid': item_id,
            'fqnames': fqnames,
            'fqname': fqnames[0],
            'acl': item_acl,
            'acl_default': acl_default
        })
    # deleted items have no names; this sort places deleted items on top of the report;
    # the display name may be similar to: "9cf939f ~(DeletedItemName)"
    items_acls = sorted(items_acls, key=lambda k: (k['name'], k['name_old']))
    return render_template('admin/item_acl_report.html',
                           title_name=_('Item ACL Report'),
                           number_items=len(items_acls),
                           items_acls=items_acls)
def itemsize():
    """display a table with item sizes, largest first"""
    headings = [_('Size'), _('Item name')]
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Not(Term(NAMESPACE, NAMESPACE_USERPROFILES)),
                 Not(Term(TRASH, True))])
    metas = flaskg.storage.search_meta(query,
                                       idx_name=LATEST_REVS,
                                       sortedby=[NAME],
                                       limit=None)
    # One (size, fqname) row per item, sorted descending by size.
    rows = sorted(
        ((meta[SIZE], CompositeName(meta[NAMESPACE], NAME_EXACT, meta[NAME][0]))
         for meta in metas),
        reverse=True)
    return render_template('user/itemsize.html',
                           title_name=_("Item Sizes"),
                           headings=headings,
                           rows=rows)
def build_filter_terms(field_name, *, include=None, exclude=None):
    """
    Build Whoosh query terms that may be used to filter a search.

    :param list include: List of values to allow in the search results.
        If `None` (or empty), no inclusion term gets produced.
    :param list exclude: List of values to deny from the search results.
        If `None` (or empty), no exclusion term gets produced.
    """
    terms = []
    if include:
        # A single Or over the allowed values.
        terms.append(Or([Term(field_name, value) for value in include]))
    if exclude:
        # One Not per denied value.
        for value in exclude:
            terms.append(Not(Term(field_name, value)))
    return terms
def test_or_nots3():
    """`X | Not(X)` should match every document, each exactly once."""
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           itemtype=fields.ID(stored=True))
    with TempIndex(schema, "ornot") as ix:
        writer = ix.writer()
        writer.add_document(title=u("a1"), itemtype=u("a"))
        writer.add_document(title=u("a2"), itemtype=u("a"))
        writer.add_document(title=u("b1"), itemtype=u("b"))
        writer.commit()
        query = Term('itemtype', 'a') | Not(Term('itemtype', 'a'))
        with ix.searcher() as searcher:
            titles = " ".join(hit["title"] for hit in searcher.search(query))
            assert titles == "a1 a2 b1"
def user_acl_report(uid):
    """
    Render a per-item ACL capability report for the user with id ``uid``.

    For every item (user profiles excluded) the report records whether the
    user may read/write/create/admin/destroy it. Because results are
    sorted by (namespace, name), consecutive items sharing the same
    (namespace, parent names, ACL) triple reuse the previously computed
    permission set instead of re-evaluating the ACL.
    """
    query = And([
        Term(WIKINAME, app.cfg.interwikiname),
        Not(Term(NAMESPACE, NAMESPACE_USERPROFILES))
    ])
    all_items = flaskg.storage.search_meta(query,
                                           idx_name=LATEST_REVS,
                                           sortedby=[NAMESPACE, NAME],
                                           limit=None)
    theuser = user.User(uid=uid)
    itemwise_acl = []
    # Cache of the last ACL-relevant metadata triple and its permissions.
    last_item_acl_parts = (None, None, None)
    last_item_result = {
        'read': False,
        'write': False,
        'create': False,
        'admin': False,
        'destroy': False
    }
    for item in all_items:
        if item.meta.get(NAME):
            fqname = CompositeName(item.meta.get(NAMESPACE), NAME_EXACT,
                                   item.meta.get(NAME)[0])
        else:
            # Nameless (e.g. deleted) items are addressed by itemid.
            fqname = CompositeName(item.meta.get(NAMESPACE), ITEMID,
                                   item.meta.get(ITEMID))
        this_rev_acl_parts = (item.meta[NAMESPACE],
                              item.meta.get(PARENTNAMES),
                              item.meta.get(ACL))
        name_parts = {
            'name': item.meta.get(NAME),
            'namespace': item.meta.get(NAMESPACE),
            'itemid': item.meta.get(ITEMID),
            'fqname': fqname
        }
        if not last_item_acl_parts == this_rev_acl_parts:
            # ACL context changed: recompute the permission set.
            last_item_acl_parts = this_rev_acl_parts
            last_item_result = {
                'read': theuser.may.read(fqname),
                'write': theuser.may.write(fqname),
                'create': theuser.may.create(fqname),
                'admin': theuser.may.admin(fqname),
                'destroy': theuser.may.destroy(fqname)
            }
        itemwise_acl.append({**name_parts, **last_item_result})
    return render_template('admin/user_acl_report.html',
                           title_name=_('User ACL Report'),
                           user_names=theuser.name,
                           itemwise_acl=itemwise_acl)
def get_query(line, ix):
    """Build an Or-of-terms query from the first tab-separated column of
    ``line``, parsed against the "post" field of the index schema."""
    columns = line.strip().split('\t')
    post_text = columns[0].decode('utf-8')
    parsed = QueryParser("post", ix.schema).parse(post_text)
    # Flatten the parsed query into a flat disjunction of its terms.
    return Or([Term(*term) for term in list(parsed.all_terms())])
def test_simplify():
    """simplify() should expand a Prefix into an Or of the matching index
    terms, propagating the prefix's boost onto each expanded term."""
    schema = fields.Schema(k=fields.ID, v=fields.TEXT)
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee"))
    writer.add_document(k=u("2"), v=u("brie glue geewhiz goop julia"))
    writer.commit()
    reader = ix.reader()
    original = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")])
    expanded = And([
        Or([Term('v', u('bear'), boost=2.0),
            Term('v', u('bee'), boost=2.0),
            Term('v', u('brie'), boost=2.0)]),
        Term('v', 'juliet'),
    ])
    assert_equal(original.simplify(reader), expanded)
def more_like(pk, source, top=5):
    """Find similar units.

    Returns (pks, scores): the pks of units whose normalized score exceeds
    30 (excluding the current unit ``pk``), plus the score mapping.
    """
    index = get_source_index()
    with index.searcher() as searcher:
        # Extract key terms from the source text.
        key_terms = searcher.key_terms_from_text('source', source,
                                                 numterms=10,
                                                 normalize=False)
        # Disjunction of the key terms, weighted by term significance.
        term_query = Or(
            [Term('source', word, boost=weight) for word, weight in key_terms])
        # Grab fulltext results as (pk, score) pairs.
        hits = [(hit['pk'], hit.score)
                for hit in searcher.search(term_query, limit=top)]
    if not hits:
        return [], {}
    # Normalize scores to 0-100
    best = max(score for _, score in hits)
    scores = {unit_pk: score * 100 / best for unit_pk, score in hits}
    # Filter results with score above 30 and not current unit
    similar = [unit_pk for unit_pk, _ in hits
               if scores[unit_pk] > 30 and unit_pk != pk]
    return similar, scores