def test_intersection():
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alpha bravo charlie delta"))
    w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo"))
    w.add_document(key=u("c"), value=u("charlie delta golf hotel"))
    w.commit()

    w = ix.writer()
    w.add_document(key=u("d"), value=u("india alpha bravo charlie"))
    w.add_document(key=u("e"), value=u("delta bravo india bravo"))
    w.commit()

    with ix.searcher() as s:
        q = And([Term("value", u("bravo")), Term("value", u("delta"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "e"]

        q = And([Term("value", u("bravo")), Term("value", u("alpha"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "b", "d"]

def test_regular_and():
    ix = get_index()
    with ix.searcher() as s:
        aq = And([Term("text", "bravo"), Term("text", "alfa")])
        m = aq.matcher(s)
        while m.is_active():
            orig = s.stored_fields(m.id())["text"]
            for span in m.spans():
                v = orig[span.start]
                assert v == "bravo" or v == "alfa"
            m.next()

def test_simplify():
    s = fields.Schema(k=fields.ID, v=fields.TEXT)
    ix = RamStorage().create_index(s)
    w = ix.writer()
    w.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee"))
    w.add_document(k=u("2"), v=u("brie glue geewhiz goop julia"))
    w.commit()

    r = ix.reader()
    q1 = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")])
    q2 = And([Or([Term('v', u('bear'), boost=2.0),
                  Term('v', u('bee'), boost=2.0),
                  Term('v', u('brie'), boost=2.0)]),
              Term('v', 'juliet')])
    assert_equal(q1.simplify(r), q2)

def test_accept():
    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 2.0
        return q

    before = And([Term("a", u("b")),
                  Or([Term("c", u("d")),
                      Phrase("a", [u("e"), u("f")])]),
                  Phrase("a", [u("g"), u("h")], boost=0.25)])
    after = before.accept(boost_phrases)
    assert_equal(after, And([Term("a", u("b")),
                             Or([Term("c", u("d")),
                                 Phrase("a", [u("e"), u("f")], boost=2.0)]),
                             Phrase("a", [u("g"), u("h")], boost=0.5)]))

    before = Phrase("a", [u("b"), u("c")], boost=2.5)
    after = before.accept(boost_phrases)
    assert_equal(after, Phrase("a", [u("b"), u("c")], boost=5.0))

def test_duplicates():
    q = And([Term("a", u("b")), Term("a", u("b"))])
    assert_equal(q.normalize(), Term("a", u("b")))

    q = And([Prefix("a", u("b")), Prefix("a", u("b"))])
    assert_equal(q.normalize(), Prefix("a", u("b")))

    q = And([Variations("a", u("b")),
             And([Variations("a", u("b")), Term("a", u("b"))])])
    assert_equal(q.normalize(), And([Variations("a", u("b")),
                                     Term("a", u("b"))]))

    q = And([Term("a", u("b")), Prefix("a", u("b")),
             Term("a", u("b"), boost=1.1)])
    assert_equal(q.normalize(), q)

    # Wildcard without * or ? normalizes to Term
    q = And([Wildcard("a", u("b")),
             And([Wildcard("a", u("b")), Term("a", u("b"))])])
    assert_equal(q.normalize(), Term("a", u("b")))

def atom(item_name):
    # Currently atom feeds behave in the following way:
    # - Text diffs are shown in a side-by-side fashion
    # - The current binary item is fully rendered in the feed
    # - Image (binary) diffs are shown using PIL
    # - The first item is always rendered fully
    # - Revision meta (id, size and comment) is shown for parent and current revision
    query = Term(WIKINAME, app.cfg.interwikiname)
    if item_name:
        query = And([query, Term(NAME_EXACT, item_name)])
    revs = list(flaskg.storage.search(query, idx_name=LATEST_REVS,
                                      sortedby=[MTIME], reverse=True, limit=1))
    if revs:
        rev = revs[0]
        cid = cache_key(usage="atom", revid=rev.revid, item_name=item_name)
        content = app.cache.get(cid)
    else:
        content = None
        cid = None
    if content is None:
        if not item_name:
            title = "{0}".format(app.cfg.sitename)
        else:
            title = "{0} - {1}".format(app.cfg.sitename, item_name)
        feed = AtomFeed(title=title, feed_url=request.url, url=request.host_url)
        query = Term(WIKINAME, app.cfg.interwikiname)
        if item_name:
            query = And([query, Term(NAME_EXACT, item_name)])
        history = flaskg.storage.search(query, idx_name=ALL_REVS,
                                        sortedby=[MTIME], reverse=True, limit=100)
        for rev in history:
            name = rev.name
            item = rev.item
            this_revid = rev.meta[REVID]
            previous_revid = rev.meta.get(PARENTID)
            this_rev = rev
            try:
                hl_item = Item.create(name, rev_id=this_revid)
                if previous_revid is not None:
                    # HTML diff for subsequent revisions
                    previous_rev = item[previous_revid]
                    content = hl_item.content._render_data_diff_atom(previous_rev, this_rev)
                else:
                    # full html rendering for new items
                    content = render_template('atom.html', get='first_revision',
                                              rev=this_rev,
                                              content=Markup(hl_item.content._render_data()),
                                              revision=this_revid)
                content_type = 'html'
            except Exception as e:
                logging.exception("content rendering crashed")
                content = _('MoinMoin feels unhappy.')
                content_type = 'text'
            author = get_editor_info(rev.meta, external=True)
            rev_comment = rev.meta.get(COMMENT, '')
            if rev_comment:
                # Trim down extremely long revision comments
                if len(rev_comment) > 80:
                    content = render_template('atom.html', get='comment_cont_merge',
                                              comment=rev_comment[79:],
                                              content=Markup(content))
                    rev_comment = "{0}...".format(rev_comment[:79])
                feed_title = "{0} - {1}".format(author.get(NAME, ''), rev_comment)
            else:
                feed_title = "{0}".format(author.get(NAME, ''))
            if not item_name:
                feed_title = "{0} - {1}".format(name, feed_title)
            feed.add(title=feed_title, title_type='text',
                     summary=content, summary_type=content_type,
                     author=author,
                     url=url_for_item(name, rev=this_revid, _external=True),
                     updated=datetime.fromtimestamp(rev.meta[MTIME]))
        content = feed.to_string()
        # Hack to add an XSLT stylesheet declaration, since AtomFeed doesn't allow this
        content = content.split("\n")
        content.insert(1, render_template('atom.html', get='xml'))
        content = "\n".join(content)
        if cid is not None:
            app.cache.set(cid, content)
    return Response(content, content_type='application/atom+xml')

def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
           fields='', highlight=False, facets=None, date_facets=None,
           query_facets=None, narrow_queries=None, spelling_query=None,
           within=None, dwithin=None, distance_point=None, models=None,
           limit_to_registered_models=None, result_class=None, **kwargs):
    if not self.setup_complete:
        self.setup()

    # A zero length query should return no results.
    if len(query_string) == 0:
        return {
            'results': [],
            'hits': 0,
        }

    query_string = force_text(query_string)

    # A one-character query (non-wildcard) gets nabbed by a stopwords
    # filter and should yield zero results.
    if len(query_string) <= 1 and query_string != u'*':
        return {
            'results': [],
            'hits': 0,
        }

    reverse = False

    if sort_by is not None:
        # Determine if we need to reverse the results and if Whoosh can
        # handle what it's being asked to sort by. Reversing is an
        # all-or-nothing action, unfortunately.
        sort_by_list = []
        reverse_counter = 0

        for order_by in sort_by:
            if order_by.startswith('-'):
                reverse_counter += 1

        if reverse_counter and reverse_counter != len(sort_by):
            raise SearchBackendError("Whoosh requires all order_by fields"
                                     " to use the same sort direction")

        for order_by in sort_by:
            if order_by.startswith('-'):
                sort_by_list.append(order_by[1:])
                if len(sort_by_list) == 1:
                    reverse = True
            else:
                sort_by_list.append(order_by)
                if len(sort_by_list) == 1:
                    reverse = False

        sort_by = sort_by_list[0]

    if facets is not None:
        warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

    if date_facets is not None:
        warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

    if query_facets is not None:
        warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    narrow_searcher = None

    if narrow_queries is not None:
        # Potentially expensive? I don't see another way to do it in Whoosh...
        narrow_searcher = self.index.searcher()

        for nq in narrow_queries:
            recent_narrowed_results = narrow_searcher.search(
                self.parser.parse(force_text(nq)), limit=None)

            if len(recent_narrowed_results) <= 0:
                return {
                    'results': [],
                    'hits': 0,
                }

            if narrowed_results:
                narrowed_results.filter(recent_narrowed_results)
            else:
                narrowed_results = recent_narrowed_results

    self.index = self.index.refresh()

    if self.index.doc_count():
        parsed_query = self.parser.parse(query_string)

        if len(model_choices) > 0:
            narrow_model = [Term(DJANGO_CT, rm) for rm in model_choices]
            parsed_query = And([Or(narrow_model), parsed_query])

        searcher = self.index.searcher()

        # In the event of an invalid/stopworded query, recover gracefully.
        if parsed_query is None:
            return {
                'results': [],
                'hits': 0,
            }

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        collapse_field = kwargs.get("collapse")
        collapse_limit = kwargs.get("collapse_limit")

        search_kwargs = {
            'pagelen': page_length,
            'sortedby': sort_by,
            'reverse': reverse,
        }

        if collapse_field is not None:
            search_kwargs['collapse'] = FieldFacet(collapse_field)
            search_kwargs['collapse_limit'] = 1
            if kwargs.get("collapse_order") is not None:
                order = kwargs.get("collapse_order")
                collapse_order = FieldFacet(order.replace('-', ''),
                                            reverse=order.find('-') > -1)
                search_kwargs['collapse_order'] = collapse_order

        # Handle the case where the results have been narrowed.
        if narrowed_results is not None:
            search_kwargs['filter'] = narrowed_results

        try:
            raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        grouped_results = None

        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        if collapse_field is not None and collapse_limit > 1:
            search_kwargs = {'sortedby': collapse_order}
            grouped_results = []
            for result in raw_page:
                query = And([Term(collapse_field, result[collapse_field]), parsed_query])
                results = searcher.search(query, limit=collapse_limit, **search_kwargs)
                grouped_results.append(results)

        results = self._process_results(raw_page, result_class=result_class,
                                        collapse_field=collapse_field,
                                        grouped_results=grouped_results)
        searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results
    else:
        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)
        else:
            spelling_suggestion = None

        return {
            'results': [],
            'hits': 0,
            'spelling_suggestion': spelling_suggestion,
        }

def search(request):
    if request.method == 'POST':
        form = Search_Form(request.POST)
        if form.is_valid():
            if not aux_check_index():
                aux_reset_all()
            key = form.cleaned_data['key_word'].lower()
            type = form.cleaned_data['type']
            ix = open_dir(dirindex)
            with ix.searcher() as searcher:
                words = key.strip().split()
                terms_classified = []
                for word in words:
                    terms = []
                    for desc in ['descripcionECI', 'descripcionMM', 'descripcionFC']:
                        terms.append(Term(desc, word))
                    terms_classified.append(terms)
                subqueries = []
                for t in terms_classified:
                    if type == 'N3':
                        subqueries.append(And(t))
                    else:
                        subqueries.append(Or(t))
                query = subqueries[0]
                if len(subqueries) > 1:
                    if type == 'N1':
                        query = Or(subqueries)
                    else:
                        query = And(subqueries)
                results = searcher.search(query)
                title = "Resultados para: "
                mostrar = True
                if len(results) == 0:
                    title = "No hay resultados para: "
                    mostrar = False
                eci = []
                mm = []
                fc = []
                for r in results:
                    eci.append(Historico_ECI.objects.filter(
                        producto_id=r['ean']).order_by("-fecha")[0])
                    mm.append(Historico_MM.objects.filter(
                        producto_id=r['ean']).order_by("-fecha")[0])
                    fc.append(Historico_FC.objects.filter(
                        producto_id=r['ean']).order_by("-fecha")[0])
                return render(request, 'search.html',
                              {"eci": eci, "mm": mm, 'fc': fc,
                               "title": title + key, "mostrar": mostrar})
    else:
        form = Search_Form()
    return render(request, 'search.html', {'form': form})

def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert q1 == q1a
        assert hash(q1) == hash(q1a)
        assert q1 != q2

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))], boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"), boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]), Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)
    do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")), limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))

            if ored_values_match is not None:
                ored_values = ored_values_match.group(0)
                values = [v for v in pattern_val.findall(ored_values) if v != 'OR']
                values = [v[1:-1] for v in values if (v[0] == '"' and v[-1] == '"')]
                expr = Or([Term(key, value) for value in values])
            else:
                value = pattern_val.match(kvp[voffset:]).group(0)
                # print key, value
                if value[0] == '"' and value[-1] == '"':
                    value = value[1:-1]
                expr = Term(key, value)
        except Exception as e:
            sys.stderr.write('Error parsing %s, Exception: %s' % (s, str(e)))
            raise Exception('Error Parsing FQ')
        if expr is not None:
            exprs.append(expr)
    return And(exprs) if len(exprs) > 0 else None


def to_solr_format(res, start):
    solr_res = {'responseHeader': {}}
    solr_res['responseHeader'].update({'QTime': 0.0})
    solr_res.update({'response': {}})
    solr_res['response'].update({'numFound': 0})
    solr_res['response'].update({'start': start})
    solr_res['response'].update({'maxScore': 0.0})
    solr_res['response'].update({'docs': []})
    for r in res:
        solr_res['response']['docs'].append({
            'url': r['url'],
            'product_title': r['title'],
            'raw_catpred': r['catpred'],
            'sku': r['sku'],

def run(self, directory='HTML', theme='topside_cms', exclude_ns='userprofiles', user=None, query=None): if theme: app.cfg.user_defaults[THEME_NAME] = theme exclude_ns = exclude_ns.split(',') if exclude_ns else [] before_wiki() norm = os.path.normpath join = os.path.join if '/' in directory: # user has specified complete path to root html_root = directory else: html_root = norm(join(app.cfg.wikiconfig_dir, directory)) repo_root = norm(join(app.cfg.wikiconfig_dir)) moinmoin = norm(join(app.cfg.wikiconfig_dir, 'src', 'moin')) # override ACLs with permission to read all items for namespace, acls in app.cfg.acl_mapping: acls['before'] = 'All:read' # create an empty output directory after deleting any existing directory print u'Creating output directory {0}, starting to copy supporting files'.format( html_root) if os.path.exists(html_root): shutil.rmtree(html_root, ignore_errors=False) else: os.makedirs(html_root) # create subdirectories and copy static css, icons, images into "static" subdirectory shutil.copytree(norm(join(moinmoin, 'static')), norm(join(html_root, 'static'))) shutil.copytree(norm(join(repo_root, 'wiki_local')), norm(join(html_root, '+serve/wiki_local'))) # copy files from xstatic packaging into "+serve" subdirectory pkg = app.cfg.pkg xstatic_dirs = [ 'font_awesome', 'jquery', 'jquery_tablesorter', 'autosize' ] if theme in [ 'basic', ]: xstatic_dirs.append('bootstrap') for dirs in xstatic_dirs: xs = XStatic(getattr(pkg, dirs), root_url='/static', provider='local', protocol='http') shutil.copytree(xs.base_dir, norm(join(html_root, '+serve', dirs))) # copy directories for theme's static files theme = app.cfg.user_defaults[THEME_NAME] if theme == 'topside_cms': # topside_cms uses topside CSS files from_dir = norm(join(moinmoin, 'themes/topside/static')) else: from_dir = norm(join(moinmoin, 'themes', theme, 'static')) to_dir = norm(join(html_root, '_themes', theme)) shutil.copytree(from_dir, to_dir) # convert: <img alt="svg" src="/+get/+7cb364b8ca5d4b7e960a4927c99a2912/svg" /> # to: <img alt="svg" src="+get/svg" /> invalid_src = re.compile(r' src="/\+get/\+[0-9a-f]{32}/') valid_src = u' src="+get/' # get ready to render and copy individual items names = [] home_page = None get_dir = norm(join( html_root, '+get')) # images and other raw data from wiki content os.makedirs(get_dir) if query: q = And([ Term(WIKINAME, app.cfg.interwikiname), Regex(NAME_EXACT, query) ]) else: q = Every() print 'Starting to dump items' for current_rev in app.storage.search(q, limit=None, sortedby="name"): if current_rev.namespace in exclude_ns: # we usually do not copy userprofiles, no one can login to a static wiki continue if not current_rev.name: # TODO: we skip nameless tickets, but named tickets and comments are processed with ugly names continue try: item_name = current_rev.fqname.fullname rendered = show_item( item_name, CURRENT) # @@@ userid is needed for acls here # convert / characters in sub-items and namespaces and save names for index file_name = item_name.replace('/', SLASH) filename = norm(join(html_root, file_name)) names.append(file_name) except Forbidden: print u'Failed to dump {0}: Forbidden'.format(current_rev.name) continue except KeyError: print u'Failed to dump {0}: KeyError'.format(current_rev.name) continue if not isinstance(rendered, unicode): print u'Rendering failed for {0} with response {1}'.format( file_name, rendered) continue # make hrefs relative to current folder rendered = rendered.replace('href="/', 'href="') rendered = rendered.replace('src="/static/', 'src="static/') 
rendered = rendered.replace('src="/+serve/', 'src="+serve/') rendered = rendered.replace( 'href="+index/"', 'href="+index"') # trailing slash changes relative position rendered = rendered.replace( '<a href="">', u'<a href="{0}">'.format( app.cfg.default_root)) # TODO: fix basic theme # remove item ID from: src="/+get/+7cb364b8ca5d4b7e960a4927c99a2912/svg" rendered = re.sub(invalid_src, valid_src, rendered) rendered = self.subitems(rendered) # copy raw data for all items to output /+get directory; images are required, text items are of marginal/no benefit item = app.storage[current_rev.name] rev = item[CURRENT] with open(get_dir + '/' + file_name, 'wb') as f: shutil.copyfileobj(rev.data, f) # save rendered items or raw data to dump directory root contenttype = item.meta['contenttype'].split(';')[0] if contenttype in CONTENTTYPE_MEDIA and filename.endswith( CONTENTTYPE_MEDIA_SUFFIX): # do not put a rendered html-formatted file with a name like video.mp4 into root; browsers want raw data with open(filename, 'wb') as f: rev.data.seek(0) shutil.copyfileobj(rev.data, f) print u'Saved file named {0} as raw data'.format( filename).encode('utf-8') else: with open(filename, 'wb') as f: f.write(rendered.encode('utf8')) print u'Saved file named {0}'.format(filename).encode( 'utf-8') if current_rev.name == app.cfg.default_root: # make duplicates of home page that are easy to find in directory list and open with a click for target in [(current_rev.name + '.html'), ('_' + current_rev.name + '.html')]: with open(norm(join(html_root, target)), 'wb') as f: f.write(rendered.encode('utf8')) home_page = rendered # save a copy for creation of index page if home_page: # create an index page by replacing the content of the home page with a list of items # work around differences in basic and modernized theme layout # TODO: this is likely to break as new themes are added if theme == 'basic': start = '<div class="moin-content" role="main">' # basic end = '<footer class="navbar moin-footer">' div_end = '</div>' else: start = '<div id="moin-content">' # modernized , topside, topside cms end = '<footer id="moin-footer">' div_end = '</div></div>' # build a page named "+index" containing links to all wiki items ul = u'<h1>Index</h1><ul>{0}</ul>' li = u'<li><a href="{0}">{1}</a></li>' links = [] names.sort() for name in names: links.append(li.format(name, name.replace(SLASH, '/'))) name_links = ul.format(u'\n'.join(links)) try: part1 = home_page.split(start)[0] part2 = home_page.split(end)[1] page = part1 + start + name_links + div_end + end + part2 except IndexError: page = home_page print u'Error: failed to find {0} in item named {1}'.format( end, app.cfg.default_root) for target in ['+index', '_+index.html']: with open(norm(join(html_root, target)), 'wb') as f: f.write(page.encode('utf8')) else: print 'Error: no item matching name in app.cfg.default_root was found'
def __init__(self, a, b):
    self.a = a
    self.b = b
    self.q = And([a, b])

def test_random_intersections():
    domain = [u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike")]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    # Create docsperseg * segments documents containing random words from
    # the domain list. Add the documents to the index, but also keep them
    # in the "documents" list for the sanity check
    for i in xrange(segments):
        w = ix.writer()
        for j in xrange(docsperseg):
            docnum = i * docsperseg + j
            # Create a string of random words
            doc = u(" ").join(choice(domain)
                              for _ in xrange(randint(*fieldlimits)))
            # Add the string to the index
            w.add_document(key=docnum, value=doc)
            # Add a (docnum, string) tuple to the documents list
            documents.append((docnum, doc))
        w.commit()
    assert_not_equal(len(ix._segments()), 1)

    testcount = 20
    testlimits = (2, 5)

    with ix.searcher() as s:
        for i in xrange(s.doc_count_all()):
            assert_not_equal(s.stored_fields(i).get("key"), None)

        for _ in xrange(testcount):
            # Create a random list of words and manually do an intersection of
            # items in "documents" that contain the words ("target").
            words = sample(domain, randint(*testlimits))
            target = []
            for docnum, doc in documents:
                if all((doc.find(w) > -1) for w in words):
                    target.append(docnum)
            target.sort()

            # Create a query from the list of words and get two matchers from
            # it.
            q = And([Term("value", w) for w in words])
            m1 = q.matcher(s)
            m2 = q.matcher(s)

            # Try getting the list of IDs from all_ids()
            ids1 = list(m1.all_ids())

            # Try getting the list of IDs using id()/next()
            ids2 = []
            while m2.is_active():
                ids2.append(m2.id())
                m2.next()

            # Check that the two methods return the same list
            assert_equal(ids1, ids2)

            # Check that the IDs match the ones we manually calculated
            assert_equal(_keys(s, ids1), target)

def recurse(self, elem, page_href): # on first call, elem.tag.name=='page'. # Descendants (body, div, p, include, page, etc.) are processed by recursing through DOM # stack is used to detect transclusion loops page_href_new = elem.get(moin_page.page_href) if page_href_new: page_href_new = Iri(page_href_new) if page_href_new != page_href: page_href = page_href_new self.stack.append(page_href) else: self.stack.append(None) else: self.stack.append(None) try: if elem.tag == xinclude.include: # we have already recursed several levels and found a transclusion: "{{SomePage}}" or <<Include(...)>> # process the transclusion and add it to the DOM. Subsequent recursions will traverse through # the transclusion's elements. href = elem.get(xinclude.href) xpointer = elem.get(xinclude.xpointer) xp_include_pages = None xp_include_sort = None xp_include_items = None xp_include_skipitems = None xp_include_heading = None xp_include_level = None if xpointer: # we are working on an <<Include(abc)>> macro, not a {{transclusion}} xp = XPointer(xpointer) xp_include = None xp_namespaces = {} for entry in xp: uri = None name = entry.name.split(':', 1) if len(name) > 1: prefix, name = name uri = xp_namespaces.get(prefix, False) else: name = name[0] if uri is None and name == 'xmlns': d_prefix, d_uri = entry.data.split('=', 1) xp_namespaces[d_prefix] = d_uri elif uri == moin_page.namespace and name == 'include': xp_include = XPointer(entry.data) if xp_include: for entry in xp_include: name, data = entry.name, entry.data_unescape # TODO: These do not include all parameters in moin 1.9 Include macro docs: # <<Include(pagename, heading, level, from="regex", to="regex", sort=ascending|descending, items=n, skipitems=n, titlesonly, editlink)>> # these are currently unsupported in moin 2.0: from, to, titlesonly, editlink if name == 'pages': # pages == pagename in moin 1.9 xp_include_pages = data elif name == 'sort': xp_include_sort = data elif name == 'items': xp_include_items = int(data) elif name == 'skipitems': xp_include_skipitems = int(data) elif name == 'heading': xp_include_heading = data elif name == 'level': xp_include_level = data included_elements = [] if href: # We have a single page to transclude or include href = Iri(href) link = Iri(scheme='wiki', authority='') if href.scheme == 'wiki': if href.authority: raise ValueError( "can't handle xinclude for non-local authority" ) else: path = href.path[1:] elif href.scheme == 'wiki.local': page = page_href path = href.path if path[0] == '': # /subitem tmp = page.path[1:] tmp.extend(path[1:]) path = tmp elif path[0] == '..': # ../sisteritem path = page.path[1:] + path[1:] else: raise ValueError( "can't handle xinclude for schemes other than wiki or wiki.local" ) link.path = path if flaskg.user.may.read(unicode(path)): page = Item.create(unicode(path)) pages = ((page, link), ) else: # ACLs prevent user from viewing a transclusion - show message message = moin_page.p(children=(_( 'Access Denied, transcluded content suppressed.'))) attrib = {html.class_: 'warning'} div = ET.Element(moin_page.div, attrib, children=(message, )) container = ET.Element(moin_page.body, children=(div, )) return [ container, 0 ] # replace transclusion with container's child elif xp_include_pages: # we have regex of pages to include: <<Include(^qqq)>> query = And([ Term(WIKINAME, app.cfg.interwikiname), Regex(NAME_EXACT, xp_include_pages) ]) reverse = xp_include_sort == 'descending' results = flaskg.storage.search(query, sortedby=NAME_EXACT, reverse=reverse, limit=None) pagelist = [result.name 
for result in results] if xp_include_skipitems is not None: pagelist = pagelist[xp_include_skipitems:] if xp_include_items is not None: pagelist = pagelist[xp_include_items + 1:] pages = ((Item.create(p), Iri(scheme='wiki', authority='', path='/' + p)) for p in pagelist) if not pagelist: msg = _( 'Error: no items found matching "<<Include({0})>>"' ).format(xp_include_pages) attrib = {html.class_: 'moin-error'} strong = ET.Element(moin_page.strong, attrib, (msg, )) included_elements.append(strong) for page, p_href in pages: if p_href.path[0] != '/': p_href.path = IriPath('/' + '/'.join(p_href.path)) if p_href in self.stack: # we have a transclusion loop, create an error message showing list of pages forming loop loop = self.stack[self.stack.index(p_href):] loop = [ u'{0}'.format(ref.path[1:]) for ref in loop if ref is not None ] + [page.name] msg = u'Error: Transclusion loop via: ' + u', '.join( loop) attrib = {html.class_: 'moin-error'} strong = ET.Element(moin_page.strong, attrib, (msg, )) included_elements.append(strong) continue if xp_include_heading is not None: attrib = {xlink.href: p_href} children = (xp_include_heading or page.name, ) elem_a = ET.Element(moin_page.a, attrib, children=children) attrib = { moin_page.outline_level: xp_include_level or '1' } elem_h = ET.Element(moin_page.h, attrib, children=(elem_a, )) included_elements.append(elem_h) page_doc = page.content.internal_representation( attributes=Arguments(keyword=elem.attrib)) if isinstance(page.rev.data, file): page.rev.data.close() self.recurse(page_doc, page_href) # The href needs to be an absolute URI, without the prefix "wiki://" page_doc = mark_item_as_transclusion(page_doc, p_href.path) included_elements.append(page_doc) if len(included_elements) > 1: # use a div as container result = ET.Element(moin_page.div) result.extend(included_elements) elif included_elements: result = included_elements[0] else: result = None # end of processing for transclusion; the "result" will get inserted into the DOM below return result # Traverse the DOM by calling self.recurse with each child of the current elem. # Starting elem.tag.name=='page'. container = [] i = 0 while i < len(elem): child = elem[i] if isinstance(child, ET.Node): ret = self.recurse(child, page_href) if ret: # Either child or a descendant of child is a transclusion. # See top of this script for notes on why these DOM adjustments are required. if isinstance(ret, ET.Node ) and elem.tag.name in NO_BLOCK_CHILDREN: body = ret[0] if len(body) == 0: # the transcluded item is empty, insert an empty span into DOM attrib = Attributes(ret).convert() elem[i] = ET.Element(moin_page.span, attrib=attrib) elif (isinstance(body[0], ET.Node) and (len(body) > 1 or body[0].tag.name not in ('p', 'object', 'a'))): # Complex case: "some text {{BlockItem}} more text" or "\n{{BlockItem}}\n" where # the BlockItem body contains multiple p's, a table, preformatted text, etc. # These block elements cannot be made a child of the current elem, so we create # a container to replace elem. 
# Create nodes to hold any siblings before and after current child (elem[i]) before = copy.deepcopy(elem) after = copy.deepcopy(elem) before[:] = elem[0:i] after[:] = elem[i + 1:] if len(before): # there are siblings before transclude, save them in container container.append(before) new_trans_ptr = len(container) # get attributes from page node; # we expect {class: "moin-transclusion"; data-href: "http://some.org/somepage"} attrib = Attributes(ret).convert() # current elem will likely be replaced by container so we need to copy data-lineno attr if html.data_lineno in elem.attrib: attrib[html.data_lineno] = elem.attrib[ html.data_lineno] # make new div node to hold transclusion, copy children, and save in container div = ET.Element(moin_page.div, attrib=attrib, children=body[:]) container.append( div) # new_trans_ptr is index to this if len(after): container.append(after) if elem.tag.name == 'a': # invalid input [[MyPage|{{BlockItem}}]], # best option is to retain A-tag and fail html validation # TODO: error may not be obvious to user - add error message elem[i] = div else: # move up 1 level in recursion where elem becomes the child and # is usually replaced by container return [container, new_trans_ptr] else: # default action for inline transclusions or odd things like circular transclusion error messages classes = child.attrib.get(html.class_, '').split() classes += ret.attrib.get(html.class_, '').split() ret.attrib[html.class_] = ' '.join(classes) elem[i] = ret elif isinstance(ret, types.ListType): # a container has been returned. # Note: there are multiple places where a container may be constructed ret_container, trans_ptr = ret # trans_ptr points to the transclusion within ret_container. # Here the transclusion will always contain a block level element if elem.tag.name in NO_BLOCK_CHILDREN: # Complex case, transclusion effects grand-parent, great-grand-parent, e.g.: # "/* comment {{BlockItem}} */" or "text ''italic {{BlockItem}} italic'' text" # elem is an inline element, build a bigger container to replace elem's parent, before = copy.deepcopy(elem) after = copy.deepcopy(elem) before[:] = elem[0:i] + ret_container[ 0:trans_ptr] after[:] = ret_container[trans_ptr + 1:] + elem[i + 1:] if len(before): container.append(before) new_trans_ptr = len(container) # child may have classes like "comment" that must be added to transcluded element classes = child.attrib.get( moin_page.class_, '').split() # must use moin_page.class_ above, but use html.class below per html_out.py code classes += ret_container[trans_ptr].attrib.get( html.class_, '').split() ret_container[trans_ptr].attrib[ html.class_] = ' '.join(classes) container.append(ret_container[trans_ptr] ) # the transclusion if len(after): container.append(after) return [container, new_trans_ptr] else: # elem is a block element for grandchild in child: if isinstance( grandchild, ET.Node ) and grandchild.tag.name == u'include': # the include may have classes that must be added to transcluded element classes = grandchild.attrib.get( html.class_, '').split() classes += ret_container[ trans_ptr].attrib.get( html.class_, '').split() ret_container[trans_ptr].attrib[ html.class_] = ' '.join(classes) # replace child element with the container generated in lower recursion elem[i:i + 1] = ret_container # elem[i] is the child else: # default action for any ret not fitting special cases above, # e.g. tranclusion is within a table cell elem[i] = ret # we are finished with this child, advance to next sibling i += 1 finally: self.stack.pop()
def test_random_intersections():
    domain = [u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike")]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    # Create docsperseg * segments documents containing random words from
    # the domain list. Add the documents to the index, but also keep them
    # in the "documents" list for the sanity check
    for i in xrange(segments):
        w = ix.writer()
        for j in xrange(docsperseg):
            docnum = i * docsperseg + j
            # Create a string of random words
            doc = u(" ").join(choice(domain)
                              for _ in xrange(randint(*fieldlimits)))
            # Add the string to the index
            w.add_document(key=docnum, value=doc)
            # Add a (docnum, string) tuple to the documents list
            documents.append((docnum, doc))
        w.commit()
    assert len(ix._segments()) != 1

    testcount = 20
    testlimits = (2, 5)

    with ix.searcher() as s:
        for i in xrange(s.doc_count_all()):
            assert s.stored_fields(i).get("key") is not None

        for _ in xrange(testcount):
            # Create a random list of words and manually do an intersection of
            # items in "documents" that contain the words ("target").
            words = sample(domain, randint(*testlimits))
            target = []
            for docnum, doc in documents:
                if all((doc.find(w) > -1) for w in words):
                    target.append(docnum)
            target.sort()

            # Create a query from the list of words and get two matchers from
            # it.
            q = And([Term("value", w) for w in words])
            m1 = q.matcher(s)
            m2 = q.matcher(s)

            # Try getting the list of IDs from all_ids()
            ids1 = list(m1.all_ids())

            # Try getting the list of IDs using id()/next()
            ids2 = []
            while m2.is_active():
                ids2.append(m2.id())
                m2.next()

            # Check that the two methods return the same list
            assert ids1 == ids2

            # Check that the IDs match the ones we manually calculated
            assert _keys(s, ids1) == target

def search(self, string, limit=25, facet=None, proxy=True,
           boosts=None, filter=None, mask=None):
    from ..database import get_activity

    lowercase = lambda x: x.lower() if hasattr(x, "lower") else x
    string = lowercase(string)

    fields = [
        "name",
        "comment",
        "product",
        "categories",
        "location",
    ]
    boosts = boosts or {
        "name": 5,
        "comment": 1,
        "product": 3,
        "categories": 2,
        "location": 3,
    }

    qp = MultifieldParser(fields, self.index.schema, fieldboosts=boosts)

    kwargs = {'limit': limit}
    if filter is not None:
        assert isinstance(filter, dict), "`filter` must be a dictionary"
        for k in filter:
            assert k in fields, "`filter` field {} not in search schema".format(k)
        if len(filter) == 1:
            kwargs["filter"] = [Term(k, lowercase(v)) for k, v in filter.items()][0]
        else:
            kwargs["filter"] = And([Term(k, lowercase(v)) for k, v in filter.items()])
    if mask is not None:
        assert isinstance(mask, dict), "`mask` must be a dictionary"
        for k in mask:
            assert k in fields, "`mask` field {} not in search schema".format(k)
        if len(mask) == 1:
            kwargs["mask"] = [Term(k, lowercase(v)) for k, v in mask.items()][0]
        else:
            kwargs["mask"] = And([Term(k, lowercase(v)) for k, v in mask.items()])

    with self.index.searcher() as searcher:
        if facet is None:
            results = searcher.search(qp.parse(string), **kwargs)
            if 'mask' in kwargs or 'filter' in kwargs:
                print("Excluding {} filtered results".format(results.filtered_count))
            results = [dict(obj.items()) for obj in results]
        else:
            kwargs.pop('limit')
            results = {
                k: [searcher.stored_fields(i) for i in v]
                for k, v in searcher.search(
                    qp.parse(string), groupedby=facet, **kwargs
                ).groups().items()
            }

    if proxy and facet is not None:
        return {key: [get_activity((obj['database'], obj['code'])) for obj in value]
                for key, value in results.items()}
    elif proxy:
        return [get_activity((obj['database'], obj['code'])) for obj in results]
    else:
        return results

def advanced_query(parameters, page=0, n=10):
    """
    :param dict parameters: a dictionary of field-to-query pairs. Text fields
        (e.g. "rules_text", "name", "flavor_text") take query strings; numeric
        fields (e.g. power, toughness, cmc) take either a single value, which
        must match exactly, or a two-element list/tuple giving an inclusive
        range (use -1 for an open end).
    :param int page: the 'page' of results to return.
    :param int n: the number of results per page.
    :return: Exact class TBD, will provide a way to iterate over the page's
        worth of results.
    """
    import whoosh.fields
    from whoosh.query import And, Or

    schema = get_whoosh_index().schema

    # fix `page` and `n` (they may be string versions of ints)
    page = int(page)
    n = int(n)

    # After talking with Ben it sounds like we can do something to the effect
    # of taking multiple sub queries and performing unions and intersections on
    # their results. This is going to be the best way to get the desired results.
    # To start: build a list of all the query objects we'll be searching.
    query_objs = []
    for field, target in parameters.items():
        # Coerce potential numeric point queries to whoosh syntax.
        if isinstance(target, float):
            target = int(target + 0.5)
        if isinstance(target, int):
            target = str(target)
            # target = f"{{{target-1} TO {target+1}}}"
            # target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # Coerce range queries to whoosh syntax, assume they're inclusive bounds.
        if isinstance(target, (list, tuple)):
            if len(target) != 2:
                raise ValueError(f"Unable to treat parameter as range query! ({target})")
            target = f"[{target[0] if target[0] != -1 else ''} TO {target[1] if target[1] != -1 else ''}]"
            # whoosh has issues if there's an open-ended range with a space
            # separating TO from the bracket:
            target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # The comma-separated KEYWORD fields have been giving us some issues;
        # whoosh seems to be a bit bi-polar when it comes to commas in these
        # fields, so we add two subqueries, one with a comma and one without.
        if field in schema and isinstance(schema[field], whoosh.fields.KEYWORD):
            # add the extra query object:
            subqueries = [QueryParser(field, schema).parse(target.lower() + ','),
                          QueryParser(field, schema).parse(target.lower())]
            query_objs.append(Or(subqueries))
        else:
            # again, lower capitalization on everything
            query_objs.append(QueryParser(field, schema).parse(target.lower()))

    if not len(query_objs):
        return []

    # now build a nice big compound query:
    query = And(query_objs)

    with get_whoosh_index().searcher() as searcher:
        # run that query and return the appropriate results page.
        try:
            results = searcher.search_page(query, page + 1, n)
        except Exception:
            print(repr(query))
            raise
        return [x['data_obj'] for x in results]

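# Hypothetical usage sketch for advanced_query (not from the original source):
# the field names "rules_text", "cmc", and "power" are assumed to exist in the
# index schema. Per the function above, ints become exact matches and
# two-element lists become inclusive ranges.
first_page = advanced_query(
    {"rules_text": "draw a card", "cmc": 5, "power": [2, 4]},
    page=0,
    n=10,
)
for card in first_page:
    print(card)
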
def getAnnotations(self, commentText):
    tmpRes = {}
    if commentText == '':
        return []
    procCommText, origIndx = compressStr(commentText, lower=True)
    termArr = procCommText.split()
    # There might be repeating entries
    orgNames = set()
    for qs in range(0, len(termArr), self.queryStride):
        qe = min(qs + self.querySpan, len(termArr))
        q = []
        for i in range(qs, qe - 1):
            if not termArr[i] in STOP_WORDS:
                bigram = And([Term(self.fieldName, termArr[i]),
                              Term(self.fieldName, termArr[i + 1])])
                q.append(bigram)
        # print('@@', ' '.join(termArr[qs:qe]))
        # print('Query: ', q)
        res = self.searcher.search(Or(q), limit=self.topK)
        # print('Found %d results' % len(res))
        for k in range(len(res)):
            if k >= self.topK:
                break
            orgName = res[k][self.fieldName]
            orgNames.add(orgName)
    for orgName in orgNames:
        start = 0
        while start < len(procCommText):
            indx = procCommText.find(orgName, start)
            # print('###', orgName, start, indx)
            if indx == -1:
                break
            assert (indx + len(orgName) <= len(origIndx))
            start = indx + len(orgName)
            # To be a valid match
            startChar = origIndx[indx]
            endChar = origIndx[indx + len(orgName) - 1] + 1
            # TODO additional conditions for spaces!!
            if startChar >= 0 and endChar >= 0:
                if startChar in tmpRes:
                    tmpRes[startChar] = max(tmpRes[startChar], endChar)
                else:
                    tmpRes[startChar] = endChar
    resAnnot = []
    for startChar in tmpRes:
        endChar = tmpRes[startChar]
        resAnnot.append(Annotation(startChar, endChar, 'OrgDict'))
    return resAnnot

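# Minimal self-contained sketch of the sliding-window bigram pattern used in
# getAnnotations above: each adjacent word pair becomes And([Term, Term]) and
# the bigrams are combined with Or. The schema, field name, and sample data
# below are invented for illustration.
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.query import And, Or, Term

schema = Schema(org=TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(org=u"acme rocket company")
w.add_document(org=u"globex corporation")
w.commit()

tokens = u"the acme rocket launch".split()
bigrams = [And([Term("org", tokens[i]), Term("org", tokens[i + 1])])
           for i in range(len(tokens) - 1)]
with ix.searcher() as s:
    for hit in s.search(Or(bigrams), limit=5):
        print(hit["org"])  # expected to match "acme rocket company"
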
writer.add_document(id=u'guten02',
                    path=u'gutenberg/austen-persuasion.txt',
                    source=u'austen-persuasion.txt',
                    author=u'Jane Austen',
                    title=u'Chapter 1',
                    text=io.open('gutenberg/austen-persuasion.txt', encoding='utf-8').read())
writer.add_document(id=u'guten03',
                    path=u'gutenberg/blake-poems.txt',
                    source=u'blake-poems.txt',
                    author=u'William Blake',
                    title=u'SONGS OF INNOCENCE AND OF EXPERIENCE and THE BOOK of THEL',
                    text=io.open('gutenberg/blake-poems.txt', encoding='utf-8').read())
writer.commit()

# =============Query===========
index = open_dir("index")
searcher = index.searcher()

query = And([Term("text", "song"), Term("text", "wild")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

parser = QueryParser("text", index.schema)
parser.parse("song wild person")
parser.parse("(song OR wild) AND (song OR austen)")
parser.parse("song wild author:'William Blake'")

def restrict_query(self, request):
    return Or([And([Term('public', 't'), Term('searchable', 't')]),
               Term('users', request.user.username)] +
              [Term('groups', group.name) for group in request.user.groups.all()])

if not os.path.exists('index'):
    os.mkdir('index')                # create the index directory if it does not exist
ix = create_in("index", schema)      # build the index directory according to the schema
ix = open_dir("index")               # open the directory so index files can be stored

import pymysql

db = pymysql.connect(host='localhost', user="******", password='******',
                     database='lol', port=3306, charset='utf8')
cursor = db.cursor()
sql = "select * from test"
try:
    cursor.execute(sql)
    row = cursor.fetchall()
except Exception:
    print("error")

writer = ix.writer()
for i in range(10):
    path = row[i][2]
    title = row[i][10]
    content = row[i][5]
    writer.add_document(path=path, title=title, content=content)
writer.commit()

from whoosh.qparser import QueryParser
from whoosh.query import And, Or, Term

ix = open_dir("index")
with ix.searcher() as searcher:
    # combine both Term queries with And to build the query object
    query = And([Term('content', u'框架'), Term('content', u'服务')])
    res = searcher.search(query)
    print(res[0])

def perform_search(self, sentence):
    with self._searcher() as s:
        tokens = sentence.split()
        tokens = [token for token in tokens if token != REPLACED]
        print('tokens=', tokens)
        exact_and_match = And([Term(TEXT_FIELD, t) for t in tokens], boost=.5)
        exact_or_match = Or([Term(TEXT_FIELD, t) for t in tokens],
                            boost=.5, scale=0.9)
        # Added variability of maxdist based on word length
        fuzzy_or_match = Or([FuzzyTerm(TEXT_FIELD, t, prefixlength=1,
                                       maxdist=1 if len(t) < 8 else 2)
                             for t in tokens if len(t) >= 4],
                            boost=.2, scale=0.9)
        if len(tokens) > 1:
            # add bigrams if there are any
            bigrams = ['_'.join(b) for b in find_ngrams(tokens, 2)]
            bigram_fuzzy_or_match = Or([FuzzyTerm(BIGRAMS_FIELD, b, prefixlength=3,
                                                  maxdist=2 if len(b) < 8 else 3)
                                        for b in bigrams],
                                       scale=0.9)
        else:
            bigram_fuzzy_or_match = None

        non_brand_or_match = Or([Term(NONBRAND_TEXT_FIELD, t) for t in tokens])

        # q = exact_and_match \
        #     | exact_or_match \
        #     | fuzzy_or_match
        # my_match = Or([Term(f, token) for token in tokens], boost=1)
        # q = my_match
        # q = Or([FuzzyTerm(f, token, prefixlength=2) for token in tokens if len(token) >= 3],
        #        boost=1.0, scale=0.9)

        q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match
        if bigram_fuzzy_or_match:
            q = q | bigram_fuzzy_or_match

        print(q)
        search_results = self.get_search_results(self._index, s, q)
        for x in search_results:
            print(x, x.score)

        if search_results:
            score, text, matched = search_results[0].items()
            return text, list(set(matched))
        else:
            return None, None

def search(self, role, category, title, prev_turns):
    query = self.get_query(prev_turns[-1 * self.context_size:], title)
    # Only consider buyer/seller utterances
    filter_query = And([Term('role', unicode(role)),
                        Term('category', unicode(category))])

    start_time = time.time()
    with self.ix.searcher() as searcher:
        results = searcher.search(query, filter=filter_query,
                                  limit=self.num_candidates, terms=True)
        # One more try
        if len(results) == 0:
            query = self.get_query(prev_turns[-1 * (self.context_size + 1):], title)
            results = searcher.search(query, filter=filter_query,
                                      limit=self.num_candidates, terms=True)

        results = self.remove_duplicates(results)
        results = [{
            'response': r['response'],
            'context': r['immediate_context'],
            'hits': [x[1] for x in r.matched_terms()],
            'pos': r['pos'],
        } for r in results]

    # Sort by BLEU
    ref = self.process_turn(prev_turns[-1]).split()
    results = sorted(results,
                     key=lambda r: compute_bleu(r['context'], ref),
                     reverse=True)

    offered = markers.OFFER in prev_turns[-1]
    if not offered:
        results = [r for r in results
                   if not (markers.ACCEPT in r['response'] or
                           markers.REJECT in r['response'])]
    else:
        results = [r for r in results
                   if (markers.ACCEPT in r['response'] or
                       markers.REJECT in r['response'])]
        if len(results) == 0:
            results.append({'response': [markers.ACCEPT], 'context': [], 'hits': []})
            results.append({'response': [markers.REJECT], 'context': [], 'hits': []})

    n = len(results)
    if n == 0:
        self.num_empty += 1
    # if n < self.num_candidates:
    #     results.extend([{} for _ in xrange(self.num_candidates - n)])

    self.num_query += 1
    self.search_time += (time.time() - start_time)

    return results

def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2),
             Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert q == And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                     Variations("a", "BB", boost=2.0)])

def test_merge_ranges():
    q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))])
    assert q.normalize() == TermRange("f1", u("a"), u("z"))

    q = And([NumericRange("f1", None, u("aaaaa")),
             NumericRange("f1", u("zzzzz"), None)])
    assert q.normalize() == q

    q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")])
    assert q.normalize() == TermRange("f1", u("a"), u("z"))

    q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert q.normalize() == TermRange("f1", u("f"), u("m"))

    q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert q.normalize() == TermRange("f1", u("a"), u("q"))

    q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))])
    assert q.normalize() == Every("f1")

    q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")])
    assert q.normalize() == Every("f1")

    q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None),
            TermRange("f1", None, u("n"))])
    assert q.normalize() == Every("f1")

    q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")])
    assert q.normalize() == Every("f1")

    q = And([Term("f1", u("a")), And([Or([Every("f1")])])])
    assert q.normalize() == Every("f1")

def test_merge_ranges():
    q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([NumericRange("f1", None, u("aaaaa")),
             NumericRange("f1", u("zzzzz"), None)])
    assert_equal(q.normalize(), q)

    q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("f"), u("m")))

    q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("q")))

    q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")])
    assert_equal(q.normalize(), Every("f1"))

    q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None),
            TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Term("f1", u("a")), And([Or([Every("f1")])])])
    assert_equal(q.normalize(), Every("f1"))

def test_duplicates():
    q = And([Term("a", u("b")), Term("a", u("b"))])
    assert q.normalize() == Term("a", u("b"))

    q = And([Prefix("a", u("b")), Prefix("a", u("b"))])
    assert q.normalize() == Prefix("a", u("b"))

    q = And([Variations("a", u("b")),
             And([Variations("a", u("b")), Term("a", u("b"))])])
    assert q.normalize() == And([Variations("a", u("b")), Term("a", u("b"))])

    q = And([Term("a", u("b")), Prefix("a", u("b")),
             Term("a", u("b"), boost=1.1)])
    assert q.normalize() == q

    # Wildcard without * or ? normalizes to Term
    q = And([Wildcard("a", u("b")),
             And([Wildcard("a", u("b")), Term("a", u("b"))])])
    assert q.normalize() == Term("a", u("b"))

def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2),
             Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert_equal(q, And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                         Variations("a", "BB", boost=2.0)]))

# Write a python program that takes queries (you need to design the supported queries)
# and searches through the indexed archive using whoosh. A sample query to the program
# can be: RT:yes, keywords, which returns all the retweets that are related to the keywords.
# Your program should handle at least 4 queries (of your choice) similar to the sample query.
from whoosh.query import Term, And, Or
from whoosh.qparser import QueryParser

searcher = index.searcher()
parser = QueryParser("strong_hashtags", index.schema)
parser.parse("FIFAWWC USA JPN")

# Query 1: Player search
query = And([Term("tweet_text", "tobin"), Term("tweet_text", "heath")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 2: Player search
query = And([Term("tweet_text", "alex"), Term("tweet_text", "morgan")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 3: USA JPN
parser = QueryParser("strong_hashtags", index.schema)
query = parser.parse("USA JPN")
results = searcher.search(query)
print('# of hits:', len(results))