Example #1
def test_intersection():
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alpha bravo charlie delta"))
    w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo"))
    w.add_document(key=u("c"), value=u("charlie delta golf hotel"))
    w.commit()

    w = ix.writer()
    w.add_document(key=u("d"), value=u("india alpha bravo charlie"))
    w.add_document(key=u("e"), value=u("delta bravo india bravo"))
    w.commit()

    with ix.searcher() as s:
        q = And([Term("value", u("bravo")), Term("value", u("delta"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "e"]

        q = And([Term("value", u("bravo")), Term("value", u("alpha"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "b", "d"]
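
For orientation, the intersection behaviour exercised above can also be observed through the regular search API instead of a raw matcher. The following is a minimal, self-contained sketch; the schema, keys, and values are illustrative and not taken from the test suite.

# Minimal sketch: And([...]) only matches documents containing every sub-term.
from whoosh import fields
from whoosh.filedb.filestore import RamStorage
from whoosh.query import And, Term

schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT(stored=True))
ix = RamStorage().create_index(schema)

w = ix.writer()
w.add_document(key=u"a", value=u"alpha bravo charlie delta")
w.add_document(key=u"b", value=u"echo foxtrot alpha bravo")
w.add_document(key=u"c", value=u"charlie delta golf hotel")
w.commit()

with ix.searcher() as s:
    hits = s.search(And([Term("value", u"alpha"), Term("value", u"bravo")]))
    print(sorted(hit["key"] for hit in hits))  # expected: ['a', 'b']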
Example #2
def test_regular_and():
    ix = get_index()
    with ix.searcher() as s:
        aq = And([Term("text", "bravo"), Term("text", "alfa")])
        m = aq.matcher(s)
        while m.is_active():
            orig = s.stored_fields(m.id())["text"]
            for span in m.spans():
                v = orig[span.start]
                assert v == "bravo" or v == "alfa"
            m.next()
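
The spans() call above reports the term positions matched inside each document. Below is a self-contained sketch of the same matcher loop with a made-up schema standing in for get_index(); it assumes the field records positions, which fields.TEXT does by default.

from whoosh import fields
from whoosh.filedb.filestore import RamStorage
from whoosh.query import And, Term

schema = fields.Schema(text=fields.TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(text=u"alfa bravo charlie bravo")
w.commit()

with ix.searcher() as s:
    m = And([Term("text", u"alfa"), Term("text", u"bravo")]).matcher(s)
    while m.is_active():
        # each span's start/end are term positions within the matched document
        print(m.id(), [(sp.start, sp.end) for sp in m.spans()])
        m.next()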
Example #3
def test_simplify():
    s = fields.Schema(k=fields.ID, v=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee"))
    w.add_document(k=u("2"), v=u("brie glue geewhiz goop julia"))
    w.commit()

    r = ix.reader()
    q1 = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")])
    q2 = And([Or([Term('v', u('bear'), boost=2.0), Term('v', u('bee'), boost=2.0),
                  Term('v', u('brie'), boost=2.0)]), Term('v', 'juliet')])
    assert_equal(q1.simplify(r), q2)
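
simplify() takes a reader because it expands pattern queries such as Prefix into the concrete terms actually present in the index, copying the boost onto each generated Term. A small sketch of that expansion in isolation (the index contents are made up):

from whoosh import fields
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Prefix

schema = fields.Schema(v=fields.TEXT)
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(v=u"aardvark apple bear bee brie")
w.commit()

r = ix.reader()
print(Prefix("v", u"b", boost=2.0).simplify(r))
# roughly: (v:bear^2.0 OR v:bee^2.0 OR v:brie^2.0)
r.close()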
Example #4
def test_accept():
    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 2.0
        return q

    before = And([Term("a", u("b")), Or([Term("c", u("d")), Phrase("a", [u("e"), u("f")])]),
                  Phrase("a", [u("g"), u("h")], boost=0.25)])
    after = before.accept(boost_phrases)
    assert_equal(after, And([Term("a", u("b")),
                             Or([Term("c", u("d")), Phrase("a", [u("e"), u("f")], boost=2.0)]),
                             Phrase("a", [u("g"), u("h")], boost=0.5)]))

    before = Phrase("a", [u("b"), u("c")], boost=2.5)
    after = before.accept(boost_phrases)
    assert_equal(after, Phrase("a", [u("b"), u("c")], boost=5.0))
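
accept() rebuilds the query tree with the given function applied to every node, which is why the nested Phrase boosts above are doubled without touching the surrounding Terms. An illustrative sketch with a different, hypothetical rewrite function:

from whoosh.query import And, Or, Term

def uppercase_terms(q):
    # return a rewritten copy for Term nodes, pass everything else through
    if isinstance(q, Term):
        return Term(q.fieldname, q.text.upper(), boost=q.boost)
    return q

q = And([Term("a", u"b"), Or([Term("a", u"c"), Term("a", u"d")])])
print(q.accept(uppercase_terms))  # roughly: (a:B AND (a:C OR a:D))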
Example #5
def test_duplicates():
    q = And([Term("a", u("b")), Term("a", u("b"))])
    assert_equal(q.normalize(), Term("a", u("b")))

    q = And([Prefix("a", u("b")), Prefix("a", u("b"))])
    assert_equal(q.normalize(), Prefix("a", u("b")))

    q = And([Variations("a", u("b")), And([Variations("a", u("b")), Term("a", u("b"))])])
    assert_equal(q.normalize(), And([Variations("a", u("b")), Term("a", u("b"))]))

    q = And([Term("a", u("b")), Prefix("a", u("b")), Term("a", u("b"), boost=1.1)])
    assert_equal(q.normalize(), q)

    # Wildcard without * or ? normalizes to Term
    q = And([Wildcard("a", u("b")), And([Wildcard("a", u("b")), Term("a", u("b"))])])
    assert_equal(q.normalize(), Term("a", u("b")))
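
The nested cases above rely on normalize() flattening nested compound queries of the same type and dropping duplicate sub-queries (duplicates that differ only in boost are kept). A quick illustration of the flattening:

from whoosh.query import And, Term

q = And([Term("a", u"b"), And([Term("a", u"b"), Term("c", u"d")])])
print(q.normalize())  # roughly: (a:b AND c:d)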
Example #6
def atom(item_name):
    # Currently atom feeds behave in the following way:
    # - Text diffs are shown in a side-by-side fashion
    # - The current binary item is fully rendered in the feed
    # - Image(binary)'s diff is shown using PIL
    # - First item is always rendered fully
    # - Revision meta (id, size, and comment) is shown for parent and current revision
    query = Term(WIKINAME, app.cfg.interwikiname)
    if item_name:
        query = And([
            query,
            Term(NAME_EXACT, item_name),
        ])
    revs = list(
        flaskg.storage.search(query,
                              idx_name=LATEST_REVS,
                              sortedby=[MTIME],
                              reverse=True,
                              limit=1))
    if revs:
        rev = revs[0]
        cid = cache_key(usage="atom", revid=rev.revid, item_name=item_name)
        content = app.cache.get(cid)
    else:
        content = None
        cid = None
    if content is None:
        if not item_name:
            title = "{0}".format(app.cfg.sitename)
        else:
            title = "{0} - {1}".format(app.cfg.sitename, item_name)
        feed = AtomFeed(title=title,
                        feed_url=request.url,
                        url=request.host_url)
        query = Term(WIKINAME, app.cfg.interwikiname)
        if item_name:
            query = And([
                query,
                Term(NAME_EXACT, item_name),
            ])
        history = flaskg.storage.search(query,
                                        idx_name=ALL_REVS,
                                        sortedby=[MTIME],
                                        reverse=True,
                                        limit=100)
        for rev in history:
            name = rev.name
            item = rev.item
            this_revid = rev.meta[REVID]
            previous_revid = rev.meta.get(PARENTID)
            this_rev = rev
            try:
                hl_item = Item.create(name, rev_id=this_revid)
                if previous_revid is not None:
                    # HTML diff for subsequent revisions
                    previous_rev = item[previous_revid]
                    content = hl_item.content._render_data_diff_atom(
                        previous_rev, this_rev)
                else:
                    # full html rendering for new items
                    content = render_template(
                        'atom.html',
                        get='first_revision',
                        rev=this_rev,
                        content=Markup(hl_item.content._render_data()),
                        revision=this_revid)
                content_type = 'html'
            except Exception as e:
                logging.exception("content rendering crashed")
                content = _('MoinMoin feels unhappy.')
                content_type = 'text'
            author = get_editor_info(rev.meta, external=True)
            rev_comment = rev.meta.get(COMMENT, '')
            if rev_comment:
                # Trim down extremely long revision comment
                if len(rev_comment) > 80:
                    content = render_template('atom.html',
                                              get='comment_cont_merge',
                                              comment=rev_comment[79:],
                                              content=Markup(content))
                    rev_comment = "{0}...".format(rev_comment[:79])
                feed_title = "{0} - {1}".format(author.get(NAME, ''),
                                                rev_comment)
            else:
                feed_title = "{0}".format(author.get(NAME, ''))
            if not item_name:
                feed_title = "{0} - {1}".format(name, feed_title)
            feed.add(
                title=feed_title,
                title_type='text',
                summary=content,
                summary_type=content_type,
                author=author,
                url=url_for_item(name, rev=this_revid, _external=True),
                updated=datetime.fromtimestamp(rev.meta[MTIME]),
            )
        content = feed.to_string()
        # Hack to add XSLT stylesheet declaration since AtomFeed doesn't allow this
        content = content.split("\n")
        content.insert(1, render_template('atom.html', get='xml'))
        content = "\n".join(content)
        if cid is not None:
            app.cache.set(cid, content)
    return Response(content, content_type='application/atom+xml')
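
The query built at the top of this view, a wiki-wide Term that is only And-ed with a name restriction when an item name is given, is a generic pattern. A stripped-down sketch with hypothetical field names standing in for the MoinMoin constants:

from whoosh.query import And, Term

def build_feed_query(wikiname, item_name=None):
    # restrict to this wiki; narrow further to a single item if one was requested
    query = Term("wikiname", wikiname)
    if item_name:
        query = And([query, Term("name_exact", item_name)])
    return query

print(build_feed_query(u"MyWiki"))                 # wikiname:MyWiki
print(build_feed_query(u"MyWiki", u"FrontPage"))   # (wikiname:MyWiki AND name_exact:FrontPage)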
Example #7
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None, within=None,
               dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            parsed_query = self.parser.parse(query_string)
            if len(model_choices) > 0:
                narrow_model = [Term(DJANGO_CT, rm) for rm in model_choices]
                parsed_query = And([Or(narrow_model), parsed_query])

            searcher = self.index.searcher()

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            collapse_field = kwargs.get("collapse")
            collapse_limit = kwargs.get("collapse_limit")

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse
            }

            if collapse_field is not None:
                search_kwargs['collapse'] = FieldFacet(collapse_field)
                search_kwargs['collapse_limit'] = 1

                if kwargs.get("collapse_order") is not None:
                    order = kwargs.get("collapse_order")
                    collapse_order = FieldFacet(order.replace('-', ''), reverse=order.find('-') > -1)
                    search_kwargs['collapse_order'] = collapse_order

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # As of Whoosh 2.5.1, it returns the wrong page of results
            # if you request a page number that is too high. :(
            grouped_results = None
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }
            if collapse_field is not None and collapse_limit > 1:
                search_kwargs = {
                    'sortedby': collapse_order
                }
                grouped_results = []
                for result in raw_page:
                    query = And([Term(collapse_field, result[collapse_field]), parsed_query])
                    results = searcher.search(query, limit=collapse_limit, **search_kwargs)

                    grouped_results.append(results)

            results = self._process_results(raw_page, result_class=result_class, collapse_field=collapse_field, grouped_results=grouped_results)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }
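
The model-limiting step above wraps the parsed user query in an And together with an Or of content-type Terms, so only documents belonging to the chosen models can match. A self-contained sketch of that combination; the field name and content types are illustrative, not Haystack's actual schema:

from whoosh import fields
from whoosh.qparser import QueryParser
from whoosh.query import And, Or, Term

schema = fields.Schema(text=fields.TEXT, django_ct=fields.ID)
parsed_query = QueryParser("text", schema).parse(u"alpha bravo")

model_choices = ["auth.user", "blog.post"]
narrowed = And([Or([Term("django_ct", ct) for ct in model_choices]), parsed_query])
print(narrowed)
# roughly: ((django_ct:auth.user OR django_ct:blog.post) AND (text:alpha AND text:bravo))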
Example #8
def search(request):
    if request.method == 'POST':
        form = Search_Form(request.POST)
        if form.is_valid():
            if not aux_check_index():
                aux_reset_all()
            key = form.cleaned_data['key_word'].lower()
            type = form.cleaned_data['type']
            ix = open_dir(dirindex)
            with ix.searcher() as searcher:
                words = key.strip().split()
                terms_classified = []
                for word in words:
                    terms = []
                    for desc in [
                            'descripcionECI', 'descripcionMM', 'descripcionFC'
                    ]:
                        terms.append(Term(desc, word))
                    terms_classified.append(terms)
                subqueries = []
                for t in terms_classified:
                    if type == 'N3':
                        subqueries.append(And(t))
                    else:
                        subqueries.append(Or(t))
                query = subqueries[0]
                if len(subqueries) > 1:
                    if type == 'N1':
                        query = Or(subqueries)
                    else:
                        query = And(subqueries)
                results = searcher.search(query)
                title = "Resultados para: "
                mostrar = True
                if len(results) == 0:
                    title = "No hay resultados para: "
                    mostrar = False
                eci = []
                mm = []
                fc = []
                for r in results:
                    eci.append(
                        Historico_ECI.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                    mm.append(
                        Historico_MM.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                    fc.append(
                        Historico_FC.objects.filter(
                            producto_id=r['ean']).order_by("-fecha")[0])
                return render(
                    request, 'search.html', {
                        "eci": eci,
                        "mm": mm,
                        'fc': fc,
                        "title": title + key,
                        "mostrar": mostrar
                    })
    else:
        form = Search_Form()
    return render(request, 'search.html', {'form': form})
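
The view above builds one sub-query per search word (an Or over the three description fields, or an And when the type is 'N3') and then combines the sub-queries with Or for 'N1' and And otherwise. A compact sketch of just that combination logic, reusing the view's field names with made-up sample words:

from whoosh.query import And, Or, Term

DESC_FIELDS = ['descripcionECI', 'descripcionMM', 'descripcionFC']

def build_query(words, qtype):
    # qtype corresponds to the form's "type" field in the view above
    subqueries = []
    for word in words:
        terms = [Term(desc, word) for desc in DESC_FIELDS]
        # N3: the word must appear in every description field; otherwise any field will do
        subqueries.append(And(terms) if qtype == 'N3' else Or(terms))
    if len(subqueries) == 1:
        return subqueries[0]
    # N1: any word may match; every other mode requires all words to match
    return Or(subqueries) if qtype == 'N1' else And(subqueries)

print(build_query([u"queso", u"manchego"], 'N2'))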
Example #9
def test_query_copy_hash():
    def do(q1, q2):
        q1a = copy.deepcopy(q1)
        assert q1 == q1a
        assert hash(q1) == hash(q1a)
        assert q1 != q2

    do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5))
    do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1),
       And([Term("a", u("b")), Term("c", u("d"))], boost=1.5))
    do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]),
       Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5))
    do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]),
       DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))],
                      boost=1.5))
    do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5)))
    do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5))
    do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"),
                                                     boost=1.5))
    do(FuzzyTerm("a", u("b"), constantscore=True),
       FuzzyTerm("a", u("b"), constantscore=False))
    do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5))
    do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d")))
    do(TermRange("a", None, u("c")), TermRange("a", None, None))
    do(TermRange("a", u("b"), u("c"), boost=1.1),
       TermRange("a", u("b"), u("c"), boost=1.5))
    do(TermRange("a", u("b"), u("c"), constantscore=True),
       TermRange("a", u("b"), u("c"), constantscore=False))
    do(NumericRange("a", 1, 5), NumericRange("a", 1, 6))
    do(NumericRange("a", None, 5), NumericRange("a", None, None))
    do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5))
    do(NumericRange("a", 3, 6, constantscore=True),
       NumericRange("a", 3, 6, constantscore=False))
    # do(DateRange)
    do(Variations("a", u("render")), Variations("a", u("renders")))
    do(Variations("a", u("render"), boost=1.1),
       Variations("a", u("renders"), boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")]),
       Phrase("a", [u("b"), u("c"), u("e")]))
    do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1),
       Phrase("a", [u("b"), u("c"), u("d")], boost=1.5))
    do(Phrase("a", [u("b"), u("c"), u("d")], slop=1),
       Phrase("a", [u("b"), u("c"), u("d")], slop=2))
    # do(Ordered)
    do(Every(), Every("a"))
    do(Every("a"), Every("b"))
    do(Every("a", boost=1.1), Every("a", boost=1.5))
    do(NullQuery, Term("a", u("b")))
    do(ConstantScoreQuery(Term("a", u("b"))),
       ConstantScoreQuery(Term("a", u("c"))))
    do(ConstantScoreQuery(Term("a", u("b")), score=2.0),
       ConstantScoreQuery(Term("a", u("c")), score=2.1))
    do(Require(Term("a", u("b")), Term("c", u("d"))),
       Require(Term("a", u("b"), boost=1.1), Term("c", u("d"))))
    # do(Require)
    # do(AndMaybe)
    # do(AndNot)
    # do(Otherwise)

    do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")),
                                                        limit=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d"))),
       SpanNear(Term("a", u("b")), Term("c", u("e"))))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1),
       SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2))
    do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True),
       SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False))
    do(SpanNot(Term("a", u("b")), Term("a", u("c"))),
       SpanNot(Term("a", u("b")), Term("a", u("d"))))
    do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]),
       SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))]))
    do(SpanContains(Term("a", u("b")), Term("a", u("c"))),
       SpanContains(Term("a", u("b")), Term("a", u("d"))))
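
The test above relies on Whoosh query objects implementing value equality and hashing, so deep copies compare equal while different boosts or terms break equality. A tiny sketch of the practical consequence:

import copy
from whoosh.query import And, Term

q1 = And([Term("content", u"alpha"), Term("content", u"bravo")])
q2 = copy.deepcopy(q1)
q3 = And([Term("content", u"alpha"), Term("content", u"bravo")], boost=1.5)

assert q1 == q2 and hash(q1) == hash(q2)
assert q1 != q3
print(len({q1, q2, q3}))  # 2: equal queries collapse when used as set/dict keys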
Example #10
				if ored_values_match!=None:
					ored_values = ored_values_match.group(0)
					values = [v for v in pattern_val.findall(ored_values) if v!='OR']
					values = [v[1:-1] for v in values if (v[0]=='"' and v[-1]=='"')]
					expr = Or([Term(key, value) for value in values])
				else:
					value = pattern_val.match(kvp[voffset:]).group(0)
					#print key, value
					if value[0]=='"' and value[-1]=='"':
						value = value[1:-1]
					expr = Term(key,value)
		except Exception, e:
			sys.stderr.write('Error parsing %s, Exception: %s'%(s,str(e)))
			raise Exception('Error Parsing FQ')
		if expr!=None: exprs.append(expr)
	return And(exprs) if len(exprs)>0 else None

def to_solr_format(res, start):
	solr_res={'responseHeader':{}}
	solr_res['responseHeader'].update({'QTime':0.0})
	solr_res.update({'response':{}})
	solr_res['response'].update({'numFound':0})
	solr_res['response'].update({'start':start})
	solr_res['response'].update({'maxScore':0.0})
	solr_res['response'].update({'docs':[]})
	for r in res:
		solr_res['response']['docs'].append({
			'url':r['url'],
			'product_title':r['title'],
			'raw_catpred':r['catpred'],
			'sku':r['sku'],
Example #11
    def run(self,
            directory='HTML',
            theme='topside_cms',
            exclude_ns='userprofiles',
            user=None,
            query=None):
        if theme:
            app.cfg.user_defaults[THEME_NAME] = theme
        exclude_ns = exclude_ns.split(',') if exclude_ns else []

        before_wiki()

        norm = os.path.normpath
        join = os.path.join

        if '/' in directory:
            # user has specified complete path to root
            html_root = directory
        else:
            html_root = norm(join(app.cfg.wikiconfig_dir, directory))
        repo_root = norm(join(app.cfg.wikiconfig_dir))
        moinmoin = norm(join(app.cfg.wikiconfig_dir, 'src', 'moin'))

        # override ACLs with permission to read all items
        for namespace, acls in app.cfg.acl_mapping:
            acls['before'] = 'All:read'

        # create an empty output directory after deleting any existing directory
        print u'Creating output directory {0}, starting to copy supporting files'.format(
            html_root)
        if os.path.exists(html_root):
            shutil.rmtree(html_root, ignore_errors=False)
        else:
            os.makedirs(html_root)

        # create subdirectories and copy static css, icons, images into "static" subdirectory
        shutil.copytree(norm(join(moinmoin, 'static')),
                        norm(join(html_root, 'static')))
        shutil.copytree(norm(join(repo_root, 'wiki_local')),
                        norm(join(html_root, '+serve/wiki_local')))

        # copy files from xstatic packaging into "+serve" subdirectory
        pkg = app.cfg.pkg
        xstatic_dirs = [
            'font_awesome', 'jquery', 'jquery_tablesorter', 'autosize'
        ]
        if theme in [
                'basic',
        ]:
            xstatic_dirs.append('bootstrap')
        for dirs in xstatic_dirs:
            xs = XStatic(getattr(pkg, dirs),
                         root_url='/static',
                         provider='local',
                         protocol='http')
            shutil.copytree(xs.base_dir, norm(join(html_root, '+serve', dirs)))

        # copy directories for theme's static files
        theme = app.cfg.user_defaults[THEME_NAME]
        if theme == 'topside_cms':
            # topside_cms uses topside CSS files
            from_dir = norm(join(moinmoin, 'themes/topside/static'))
        else:
            from_dir = norm(join(moinmoin, 'themes', theme, 'static'))
        to_dir = norm(join(html_root, '_themes', theme))
        shutil.copytree(from_dir, to_dir)

        # convert: <img alt="svg" src="/+get/+7cb364b8ca5d4b7e960a4927c99a2912/svg" />
        # to:      <img alt="svg" src="+get/svg" />
        invalid_src = re.compile(r' src="/\+get/\+[0-9a-f]{32}/')
        valid_src = u' src="+get/'

        # get ready to render and copy individual items
        names = []
        home_page = None
        get_dir = norm(join(
            html_root, '+get'))  # images and other raw data from wiki content
        os.makedirs(get_dir)

        if query:
            q = And([
                Term(WIKINAME, app.cfg.interwikiname),
                Regex(NAME_EXACT, query)
            ])
        else:
            q = Every()

        print 'Starting to dump items'
        for current_rev in app.storage.search(q, limit=None, sortedby="name"):
            if current_rev.namespace in exclude_ns:
                # we usually do not copy userprofiles; no one can log in to a static wiki
                continue
            if not current_rev.name:
                # TODO: we skip nameless tickets, but named tickets and comments are processed with ugly names
                continue

            try:
                item_name = current_rev.fqname.fullname
                rendered = show_item(
                    item_name, CURRENT)  # @@@  userid is needed for acls here
                # convert / characters in sub-items and namespaces and save names for index
                file_name = item_name.replace('/', SLASH)
                filename = norm(join(html_root, file_name))
                names.append(file_name)
            except Forbidden:
                print u'Failed to dump {0}: Forbidden'.format(current_rev.name)
                continue
            except KeyError:
                print u'Failed to dump {0}: KeyError'.format(current_rev.name)
                continue

            if not isinstance(rendered, unicode):
                print u'Rendering failed for {0} with response {1}'.format(
                    file_name, rendered)
                continue
            # make hrefs relative to current folder
            rendered = rendered.replace('href="/', 'href="')
            rendered = rendered.replace('src="/static/', 'src="static/')
            rendered = rendered.replace('src="/+serve/', 'src="+serve/')
            rendered = rendered.replace(
                'href="+index/"',
                'href="+index"')  # trailing slash changes relative position
            rendered = rendered.replace(
                '<a href="">', u'<a href="{0}">'.format(
                    app.cfg.default_root))  # TODO: fix basic theme
            # remove item ID from: src="/+get/+7cb364b8ca5d4b7e960a4927c99a2912/svg"
            rendered = re.sub(invalid_src, valid_src, rendered)
            rendered = self.subitems(rendered)

            # copy raw data for all items to output /+get directory; images are required, text items are of marginal/no benefit
            item = app.storage[current_rev.name]
            rev = item[CURRENT]
            with open(get_dir + '/' + file_name, 'wb') as f:
                shutil.copyfileobj(rev.data, f)

            # save rendered items or raw data to dump directory root
            contenttype = item.meta['contenttype'].split(';')[0]
            if contenttype in CONTENTTYPE_MEDIA and filename.endswith(
                    CONTENTTYPE_MEDIA_SUFFIX):
                # do not put a rendered html-formatted file with a name like video.mp4 into root; browsers want raw data
                with open(filename, 'wb') as f:
                    rev.data.seek(0)
                    shutil.copyfileobj(rev.data, f)
                    print u'Saved file named {0} as raw data'.format(
                        filename).encode('utf-8')
            else:
                with open(filename, 'wb') as f:
                    f.write(rendered.encode('utf8'))
                    print u'Saved file named {0}'.format(filename).encode(
                        'utf-8')

            if current_rev.name == app.cfg.default_root:
                # make duplicates of home page that are easy to find in directory list and open with a click
                for target in [(current_rev.name + '.html'),
                               ('_' + current_rev.name + '.html')]:
                    with open(norm(join(html_root, target)), 'wb') as f:
                        f.write(rendered.encode('utf8'))
                home_page = rendered  # save a copy for creation of index page

        if home_page:
            # create an index page by replacing the content of the home page with a list of items
            # work around differences in basic and modernized theme layout
            # TODO: this is likely to break as new themes are added
            if theme == 'basic':
                start = '<div class="moin-content" role="main">'  # basic
                end = '<footer class="navbar moin-footer">'
                div_end = '</div>'
            else:
                start = '<div id="moin-content">'  # modernized , topside, topside cms
                end = '<footer id="moin-footer">'
                div_end = '</div></div>'
            # build a page named "+index" containing links to all wiki items
            ul = u'<h1>Index</h1><ul>{0}</ul>'
            li = u'<li><a href="{0}">{1}</a></li>'
            links = []
            names.sort()
            for name in names:
                links.append(li.format(name, name.replace(SLASH, '/')))
            name_links = ul.format(u'\n'.join(links))
            try:
                part1 = home_page.split(start)[0]
                part2 = home_page.split(end)[1]
                page = part1 + start + name_links + div_end + end + part2
            except IndexError:
                page = home_page
                print u'Error: failed to find {0} in item named {1}'.format(
                    end, app.cfg.default_root)
            for target in ['+index', '_+index.html']:
                with open(norm(join(html_root, target)), 'wb') as f:
                    f.write(page.encode('utf8'))
        else:
            print 'Error: no item matching name in app.cfg.default_root was found'
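
When a --query argument is supplied, the command narrows the dump by And-ing the wiki-wide Term with a Regex on item names, falling back to Every() otherwise. The same combination can be tried in isolation; this sketch uses made-up field names in place of the WIKINAME and NAME_EXACT constants:

from whoosh import fields
from whoosh.filedb.filestore import RamStorage
from whoosh.query import And, Every, Regex, Term

schema = fields.Schema(wikiname=fields.ID(stored=True), name_exact=fields.ID(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(wikiname=u"MyWiki", name_exact=u"Home")
w.add_document(wikiname=u"MyWiki", name_exact=u"Home/SubItem")
w.add_document(wikiname=u"MyWiki", name_exact=u"Sandbox")
w.commit()

with ix.searcher() as s:
    q = And([Term("wikiname", u"MyWiki"), Regex("name_exact", u"^Home")])
    print(sorted(hit["name_exact"] for hit in s.search(q)))  # ['Home', 'Home/SubItem']
    print(len(s.search(Every())))  # 3: Every() matches all documents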
Example #12
    def __init__(self, a, b):
        self.a = a
        self.b = b
        self.q = And([a, b])
Example #13
def test_random_intersections():
    domain = [
        u("alpha"),
        u("bravo"),
        u("charlie"),
        u("delta"),
        u("echo"),
        u("foxtrot"),
        u("golf"),
        u("hotel"),
        u("india"),
        u("juliet"),
        u("kilo"),
        u("lima"),
        u("mike"),
    ]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    # Create docsperseg * segments documents containing random words from
    # the domain list. Add the documents to the index, but also keep them
    # in the "documents" list for the sanity check
    for i in xrange(segments):
        w = ix.writer()
        for j in xrange(docsperseg):
            docnum = i * docsperseg + j
            # Create a string of random words
            doc = u(" ").join(choice(domain) for _ in xrange(randint(*fieldlimits)))
            # Add the string to the index
            w.add_document(key=docnum, value=doc)
            # Add a (docnum, string) tuple to the documents list
            documents.append((docnum, doc))
        w.commit()
    assert_not_equal(len(ix._segments()), 1)

    testcount = 20
    testlimits = (2, 5)

    with ix.searcher() as s:
        for i in xrange(s.doc_count_all()):
            assert_not_equal(s.stored_fields(i).get("key"), None)

        for _ in xrange(testcount):
            # Create a random list of words and manually do an intersection of
            # items in "documents" that contain the words ("target").
            words = sample(domain, randint(*testlimits))
            target = []
            for docnum, doc in documents:
                if all((doc.find(w) > -1) for w in words):
                    target.append(docnum)
            target.sort()

            # Create a query from the list of words and get two matchers from
            # it.
            q = And([Term("value", w) for w in words])
            m1 = q.matcher(s)
            m2 = q.matcher(s)

            # Try getting the list of IDs from all_ids()
            ids1 = list(m1.all_ids())

            # Try getting the list of IDs using id()/next()
            ids2 = []
            while m2.is_active():
                ids2.append(m2.id())
                m2.next()

            # Check that the two methods return the same list
            assert_equal(ids1, ids2)

            # Check that the IDs match the ones we manually calculated
            assert_equal(_keys(s, ids1), target)
Example #14
    def recurse(self, elem, page_href):
        # on first call, elem.tag.name=='page'.
        # Descendants (body, div, p, include, page, etc.) are processed by recursing through DOM

        # stack is used to detect transclusion loops
        page_href_new = elem.get(moin_page.page_href)
        if page_href_new:
            page_href_new = Iri(page_href_new)
            if page_href_new != page_href:
                page_href = page_href_new
                self.stack.append(page_href)
            else:
                self.stack.append(None)
        else:
            self.stack.append(None)

        try:
            if elem.tag == xinclude.include:
                # we have already recursed several levels and found a transclusion: "{{SomePage}}" or <<Include(...)>>
                # process the transclusion and add it to the DOM.  Subsequent recursions will traverse through
                # the transclusion's elements.
                href = elem.get(xinclude.href)
                xpointer = elem.get(xinclude.xpointer)

                xp_include_pages = None
                xp_include_sort = None
                xp_include_items = None
                xp_include_skipitems = None
                xp_include_heading = None
                xp_include_level = None

                if xpointer:
                    # we are working on an <<Include(abc)>> macro, not a {{transclusion}}
                    xp = XPointer(xpointer)
                    xp_include = None
                    xp_namespaces = {}
                    for entry in xp:
                        uri = None
                        name = entry.name.split(':', 1)
                        if len(name) > 1:
                            prefix, name = name
                            uri = xp_namespaces.get(prefix, False)
                        else:
                            name = name[0]

                        if uri is None and name == 'xmlns':
                            d_prefix, d_uri = entry.data.split('=', 1)
                            xp_namespaces[d_prefix] = d_uri
                        elif uri == moin_page.namespace and name == 'include':
                            xp_include = XPointer(entry.data)

                    if xp_include:
                        for entry in xp_include:
                            name, data = entry.name, entry.data_unescape
                            # TODO: These do not include all parameters in moin 1.9 Include macro docs:
                            # <<Include(pagename, heading, level, from="regex", to="regex", sort=ascending|descending, items=n, skipitems=n, titlesonly, editlink)>>
                            # these are currently unsupported in moin 2.0: from, to, titlesonly, editlink
                            if name == 'pages':  # pages == pagename in moin 1.9
                                xp_include_pages = data
                            elif name == 'sort':
                                xp_include_sort = data
                            elif name == 'items':
                                xp_include_items = int(data)
                            elif name == 'skipitems':
                                xp_include_skipitems = int(data)
                            elif name == 'heading':
                                xp_include_heading = data
                            elif name == 'level':
                                xp_include_level = data

                included_elements = []
                if href:
                    # We have a single page to transclude or include
                    href = Iri(href)
                    link = Iri(scheme='wiki', authority='')
                    if href.scheme == 'wiki':
                        if href.authority:
                            raise ValueError(
                                "can't handle xinclude for non-local authority"
                            )
                        else:
                            path = href.path[1:]
                    elif href.scheme == 'wiki.local':
                        page = page_href
                        path = href.path
                        if path[0] == '':
                            # /subitem
                            tmp = page.path[1:]
                            tmp.extend(path[1:])
                            path = tmp
                        elif path[0] == '..':
                            # ../sisteritem
                            path = page.path[1:] + path[1:]
                    else:
                        raise ValueError(
                            "can't handle xinclude for schemes other than wiki or wiki.local"
                        )

                    link.path = path

                    if flaskg.user.may.read(unicode(path)):
                        page = Item.create(unicode(path))
                        pages = ((page, link), )
                    else:
                        # ACLs prevent user from viewing a transclusion - show message
                        message = moin_page.p(children=(_(
                            'Access Denied, transcluded content suppressed.')))
                        attrib = {html.class_: 'warning'}
                        div = ET.Element(moin_page.div,
                                         attrib,
                                         children=(message, ))
                        container = ET.Element(moin_page.body,
                                               children=(div, ))
                        return [
                            container, 0
                        ]  # replace transclusion with container's child

                elif xp_include_pages:
                    # we have regex of pages to include:  <<Include(^qqq)>>
                    query = And([
                        Term(WIKINAME, app.cfg.interwikiname),
                        Regex(NAME_EXACT, xp_include_pages)
                    ])
                    reverse = xp_include_sort == 'descending'
                    results = flaskg.storage.search(query,
                                                    sortedby=NAME_EXACT,
                                                    reverse=reverse,
                                                    limit=None)
                    pagelist = [result.name for result in results]
                    if xp_include_skipitems is not None:
                        pagelist = pagelist[xp_include_skipitems:]
                    if xp_include_items is not None:
                        pagelist = pagelist[xp_include_items + 1:]
                    pages = ((Item.create(p),
                              Iri(scheme='wiki', authority='', path='/' + p))
                             for p in pagelist)
                    if not pagelist:
                        msg = _(
                            'Error: no items found matching "<<Include({0})>>"'
                        ).format(xp_include_pages)
                        attrib = {html.class_: 'moin-error'}
                        strong = ET.Element(moin_page.strong, attrib, (msg, ))
                        included_elements.append(strong)

                for page, p_href in pages:
                    if p_href.path[0] != '/':
                        p_href.path = IriPath('/' + '/'.join(p_href.path))
                    if p_href in self.stack:
                        # we have a transclusion loop, create an error message showing list of pages forming loop
                        loop = self.stack[self.stack.index(p_href):]
                        loop = [
                            u'{0}'.format(ref.path[1:])
                            for ref in loop if ref is not None
                        ] + [page.name]
                        msg = u'Error: Transclusion loop via: ' + u', '.join(
                            loop)
                        attrib = {html.class_: 'moin-error'}
                        strong = ET.Element(moin_page.strong, attrib, (msg, ))
                        included_elements.append(strong)
                        continue

                    if xp_include_heading is not None:
                        attrib = {xlink.href: p_href}
                        children = (xp_include_heading or page.name, )
                        elem_a = ET.Element(moin_page.a,
                                            attrib,
                                            children=children)
                        attrib = {
                            moin_page.outline_level: xp_include_level or '1'
                        }
                        elem_h = ET.Element(moin_page.h,
                                            attrib,
                                            children=(elem_a, ))
                        included_elements.append(elem_h)

                    page_doc = page.content.internal_representation(
                        attributes=Arguments(keyword=elem.attrib))
                    if isinstance(page.rev.data, file):
                        page.rev.data.close()

                    self.recurse(page_doc, page_href)

                    # The href needs to be an absolute URI, without the prefix "wiki://"
                    page_doc = mark_item_as_transclusion(page_doc, p_href.path)
                    included_elements.append(page_doc)

                if len(included_elements) > 1:
                    # use a div as container
                    result = ET.Element(moin_page.div)
                    result.extend(included_elements)
                elif included_elements:
                    result = included_elements[0]
                else:
                    result = None
                #  end of processing for transclusion; the "result" will get inserted into the DOM below
                return result

            # Traverse the DOM by calling self.recurse with each child of the current elem.
            # Starting elem.tag.name=='page'.
            container = []
            i = 0
            while i < len(elem):
                child = elem[i]
                if isinstance(child, ET.Node):

                    ret = self.recurse(child, page_href)

                    if ret:
                        # Either child or a descendant of child is a transclusion.
                        # See top of this script for notes on why these DOM adjustments are required.
                        if isinstance(ret, ET.Node
                                      ) and elem.tag.name in NO_BLOCK_CHILDREN:
                            body = ret[0]
                            if len(body) == 0:
                                # the transcluded item is empty, insert an empty span into DOM
                                attrib = Attributes(ret).convert()
                                elem[i] = ET.Element(moin_page.span,
                                                     attrib=attrib)
                            elif (isinstance(body[0], ET.Node)
                                  and (len(body) > 1 or body[0].tag.name
                                       not in ('p', 'object', 'a'))):
                                # Complex case: "some text {{BlockItem}} more text" or "\n{{BlockItem}}\n" where
                                # the BlockItem body contains multiple p's, a table, preformatted text, etc.
                                # These block elements cannot be made a child of the current elem, so we create
                                # a container to replace elem.
                                # Create nodes to hold any siblings before and after current child (elem[i])
                                before = copy.deepcopy(elem)
                                after = copy.deepcopy(elem)
                                before[:] = elem[0:i]
                                after[:] = elem[i + 1:]
                                if len(before):
                                    # there are siblings before transclude, save them in container
                                    container.append(before)
                                new_trans_ptr = len(container)
                                # get attributes from page node;
                                # we expect {class: "moin-transclusion"; data-href: "http://some.org/somepage"}
                                attrib = Attributes(ret).convert()
                                # current elem will likely be replaced by container so we need to copy data-lineno attr
                                if html.data_lineno in elem.attrib:
                                    attrib[html.data_lineno] = elem.attrib[
                                        html.data_lineno]
                                # make new div node to hold transclusion, copy children, and save in container
                                div = ET.Element(moin_page.div,
                                                 attrib=attrib,
                                                 children=body[:])
                                container.append(
                                    div)  # new_trans_ptr is index to this
                                if len(after):
                                    container.append(after)
                                if elem.tag.name == 'a':
                                    # invalid input [[MyPage|{{BlockItem}}]],
                                    # best option is to retain A-tag and fail html validation
                                    # TODO: error may not be obvious to user - add error message
                                    elem[i] = div
                                else:
                                    # move up 1 level in recursion where elem becomes the child and
                                    # is usually replaced by container
                                    return [container, new_trans_ptr]
                            else:
                                # default action for inline transclusions or odd things like circular transclusion error messages
                                classes = child.attrib.get(html.class_,
                                                           '').split()
                                classes += ret.attrib.get(html.class_,
                                                          '').split()
                                ret.attrib[html.class_] = ' '.join(classes)
                                elem[i] = ret
                        elif isinstance(ret, types.ListType):
                            # a container has been returned.
                            # Note: there are multiple places where a container may be constructed
                            ret_container, trans_ptr = ret
                            # trans_ptr points to the transclusion within ret_container.
                            # Here the transclusion will always contain a block level element
                            if elem.tag.name in NO_BLOCK_CHILDREN:
                                # Complex case, transclusion affects grand-parent, great-grand-parent, e.g.:
                                # "/* comment {{BlockItem}} */" or  "text ''italic {{BlockItem}} italic'' text"
                                # elem is an inline element, build a bigger container to replace elem's parent,
                                before = copy.deepcopy(elem)
                                after = copy.deepcopy(elem)
                                before[:] = elem[0:i] + ret_container[
                                    0:trans_ptr]
                                after[:] = ret_container[trans_ptr +
                                                         1:] + elem[i + 1:]
                                if len(before):
                                    container.append(before)
                                new_trans_ptr = len(container)
                                # child may have classes like "comment" that must be added to transcluded element
                                classes = child.attrib.get(
                                    moin_page.class_, '').split()
                                # must use moin_page.class_ above, but use html.class below per html_out.py code
                                classes += ret_container[trans_ptr].attrib.get(
                                    html.class_, '').split()
                                ret_container[trans_ptr].attrib[
                                    html.class_] = ' '.join(classes)
                                container.append(ret_container[trans_ptr]
                                                 )  # the transclusion
                                if len(after):
                                    container.append(after)
                                return [container, new_trans_ptr]
                            else:
                                # elem is a block element
                                for grandchild in child:
                                    if isinstance(
                                            grandchild, ET.Node
                                    ) and grandchild.tag.name == u'include':
                                        # the include may have classes that must be added to transcluded element
                                        classes = grandchild.attrib.get(
                                            html.class_, '').split()
                                        classes += ret_container[
                                            trans_ptr].attrib.get(
                                                html.class_, '').split()
                                        ret_container[trans_ptr].attrib[
                                            html.class_] = ' '.join(classes)
                                # replace child element with the container generated in lower recursion
                                elem[i:i +
                                     1] = ret_container  # elem[i] is the child
                        else:
                            # default action for any ret not fitting special cases above,
                            # e.g. transclusion is within a table cell
                            elem[i] = ret
                # we are finished with this child, advance to next sibling
                i += 1

        finally:
            self.stack.pop()
Example #15
def test_random_intersections():
    domain = [u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike")]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []

    schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    # Create docsperseg * segments documents containing random words from
    # the domain list. Add the documents to the index, but also keep them
    # in the "documents" list for the sanity check
    for i in xrange(segments):
        w = ix.writer()
        for j in xrange(docsperseg):
            docnum = i * docsperseg + j
            # Create a string of random words
            doc = u(" ").join(choice(domain)
                            for _ in xrange(randint(*fieldlimits)))
            # Add the string to the index
            w.add_document(key=docnum, value=doc)
            # Add a (docnum, string) tuple to the documents list
            documents.append((docnum, doc))
        w.commit()
    assert len(ix._segments()) != 1

    testcount = 20
    testlimits = (2, 5)

    with ix.searcher() as s:
        for i in xrange(s.doc_count_all()):
            assert s.stored_fields(i).get("key") is not None

        for _ in xrange(testcount):
            # Create a random list of words and manually do an intersection of
            # items in "documents" that contain the words ("target").
            words = sample(domain, randint(*testlimits))
            target = []
            for docnum, doc in documents:
                if all((doc.find(w) > -1) for w in words):
                    target.append(docnum)
            target.sort()

            # Create a query from the list of words and get two matchers from
            # it.
            q = And([Term("value", w) for w in words])
            m1 = q.matcher(s)
            m2 = q.matcher(s)

            # Try getting the list of IDs from all_ids()
            ids1 = list(m1.all_ids())

            # Try getting the list of IDs using id()/next()
            ids2 = []
            while m2.is_active():
                ids2.append(m2.id())
                m2.next()

            # Check that the two methods return the same list
            assert ids1 == ids2

            # Check that the IDs match the ones we manually calculated
            assert _keys(s, ids1) == target
Example #16
    def search(self, string, limit=25, facet=None, proxy=True,
               boosts=None, filter=None, mask=None):
        from ..database import get_activity

        lowercase = lambda x: x.lower() if hasattr(x, "lower") else x
        string = lowercase(string)

        fields = [
            "name",
            "comment",
            "product",
            "categories",
            "location",
        ]

        boosts = boosts or {
            "name": 5,
            "comment": 1,
            "product": 3,
            "categories": 2,
            "location": 3,
        }

        qp = MultifieldParser(
            fields,
            self.index.schema,
            fieldboosts=boosts
        )

        kwargs = {'limit': limit}
        if filter is not None:
            assert isinstance(filter, dict), "`filter` must be a dictionary"
            for k in filter:
                assert k in fields, "`filter` field {} not in search schema".format(k)
            if len(filter) == 1:
                kwargs["filter"] = [Term(k, lowercase(v)) for k, v in filter.items()][0]
            else:
                kwargs["filter"] = And([Term(k, lowercase(v)) for k, v in filter.items()])
        if mask is not None:
            assert isinstance(mask, dict), "`mask` must be a dictionary"
            for k in mask:
                assert k in fields, "`mask` field {} not in search schema".format(k)
            if len(mask) == 1:
                kwargs["mask"] = [Term(k, lowercase(v)) for k, v in mask.items()][0]
            else:
                kwargs["mask"] = And([Term(k, lowercase(v)) for k, v in mask.items()])

        with self.index.searcher() as searcher:
            if facet is None:
                results = searcher.search(qp.parse(string), **kwargs)
                if 'mask' in kwargs or 'filter' in kwargs:
                    print("Excluding {} filtered results".format(results.filtered_count))
                results = [dict(obj.items()) for obj in results]
            else:
                kwargs.pop('limit')
                results = {
                    k: [searcher.stored_fields(i) for i in v] for k, v in
                    searcher.search(
                        qp.parse(string),
                        groupedby=facet,
                        **kwargs
                    ).groups().items()}

        if proxy and facet is not None:
            return {key: [get_activity((obj['database'], obj['code'])) for obj in value]
                    for key, value in results.items()}
        elif proxy:
            return [get_activity((obj['database'], obj['code'])) for obj in results]
        else:
            return results
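
A minimal usage sketch for the method above, assuming `idx` is an instance of the class that defines this `search()`; the query string and field values are illustrative only:

# Hedged usage sketch; `idx` and the values below are assumptions for illustration.
hits = idx.search("steel production", limit=10)           # plain multifield search
filtered = idx.search("steel production",
                      filter={"location": "glo"})          # only hits whose "location" term matches
masked = idx.search("steel production",
                    mask={"categories": "transport"})      # exclude hits in that category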
Example #17
0
def advanced_query(parameters, page=0, n=10):
    """
    :param dict text_parameters: a dictionary of field-to-query pairs specifying text-based queries.
        This is good for fields like "rules_text", "name", "flavor_text", etc.
    :param dict range_parameters: a dictionary of field-to-range pairs.
        This is good for fields like power, toughness, cmc, etc.
    :param dict point_parameters: a dictionary of field-to-value parameters. Every card in the result
        set must have an exact match for every value in the dict.
        For example, if point_parameters is {'cmc': 5} then card.cmc == 5 must evaluate to true for every card in the set.
        .. warning::
            using this parameter will cause the query system to filter through whoosh results, slowing down computation.
    :param int page: the 'page' of results to return
    :param int n: the number of results per page.
    :return: Exact class TBD; will provide a way to iterate over the page's worth of results.
    """
    import whoosh.fields
    from whoosh.query import And, Or
    from whoosh.qparser import QueryParser  # used to parse per-field sub-queries below
    schema = get_whoosh_index().schema

    # fix `page` and `n` (they may be string versions of ints)
    page = int(page)
    n = int(n)

    # After talking with Ben, it sounds like we can take multiple sub-queries
    # and perform unions and intersections on their results.
    # This is going to be the best way to get the desired results.

    # to start: build a list of all the query objects we'll be searching.
    query_objs = []
    for field, target in parameters.items():
        # Coerce potential numeric point queries to whoosh syntax.
        if isinstance(target, float):
            target = int(target+0.5)
        if isinstance(target, int):
            target = str(target)
            #target = f"{{{target-1} TO {target+1}}}"
            #target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # Coerce range queries to whoosh syntax, assume they're inclusive bounds.
        if isinstance(target, (list, tuple)):
            if len(target) != 2:
                raise ValueError(f"Unable to treat parameter as range query! ({target})")
            target = f"[{target[0] if target[0] != -1 else ''} TO {target[1] if target[1] != -1 else ''}]"

            # whoosh has issues if there's an open ended range with a space separating TO from the bracket:
            target = target.replace("[ TO", "[TO").replace("TO ]", "TO]")

        # The comma-separated KEYWORD fields have been giving us some issues;
        # whoosh is inconsistent about commas in these fields,
        # so we add two subqueries, one with a trailing comma and one without.
        if field in schema and isinstance(schema[field], whoosh.fields.KEYWORD):
            # add the extra query object:
            subqueries = [QueryParser(field, schema).parse(target.lower()+','),
                          QueryParser(field, schema).parse(target.lower())]
            query_objs.append(Or(subqueries))

        else:
            query_objs.append(QueryParser(field, schema).parse(target.lower()))  # again, lowercase everything

    if not len(query_objs):
        return []

    # now build a nice big compound query:
    query = And(query_objs)
    with get_whoosh_index().searcher() as searcher:
        # run that query and return the appropriate results page.
        try:
            results = searcher.search_page(query, page+1, n)
        except Exception:
            print(repr(query))
            raise

        return [x['data_obj'] for x in results]
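
A minimal calling sketch for `advanced_query`, assuming the whoosh schema contains the fields named in the docstring ("name", "rules_text", "cmc"); the values are illustrative only:

# Hedged usage sketch; field names follow the docstring above, values are made up.
first_page = advanced_query({"name": "dragon"})                   # text query
cheap_dragons = advanced_query({"name": "dragon", "cmc": (0, 3)}) # inclusive range query on cmc
page_two = advanced_query({"rules_text": "flying"}, page=1, n=25)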
Example #18
0
    def getAnnotations(self, commentText):
        tmpRes = {}

        if commentText == '':
            return []

        procCommText, origIndx = compressStr(commentText, lower=True)

        termArr = procCommText.split()

        # There might be repeating entries
        orgNames = set()

        for qs in range(0, len(termArr), self.queryStride):
            qe = min(qs + self.querySpan, len(termArr))

            q = []
            for i in range(qs, qe - 1):
                if not termArr[i] in STOP_WORDS:
                    bigram = And([
                        Term(self.fieldName, termArr[i]),
                        Term(self.fieldName, termArr[i + 1])
                    ])
                    q.append(bigram)

            #print('@@', ' '.join(termArr[qs:qe]))
            #print('Query: ', q)

            res = self.searcher.search(Or(q), limit=self.topK)

            #print('Found %d results' % len(res))

            for k in range(len(res)):
                if k >= self.topK:
                    break
                orgName = res[k][self.fieldName]
                orgNames.add(orgName)

        for orgName in orgNames:
            start = 0
            while start < len(procCommText):
                indx = procCommText.find(orgName, start)
                #print('###', orgName, start, indx)
                if indx == -1: break
                assert (indx + len(orgName) <= len(origIndx))
                start = indx + len(orgName)
                # To be a valid match, the span must map back to non-negative
                # character offsets in the original (uncompressed) text.
                startChar = origIndx[indx]
                endChar = origIndx[indx + len(orgName) - 1] + 1
                # TODO additional conditions for spaces!!
                if startChar >= 0 and endChar >= 0:
                    if startChar in tmpRes:
                        tmpRes[startChar] = max(tmpRes[startChar], endChar)
                    else:
                        tmpRes[startChar] = endChar

        resAnnot = []

        for startChar in tmpRes:
            endChar = tmpRes[startChar]
            resAnnot.append(Annotation(startChar, endChar, 'OrgDict'))

        return resAnnot
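
The sliding-window bigram trick above reduces to a small standalone sketch: each adjacent token pair becomes an And of two Terms, and the window is searched as an Or of those bigrams (the field name and stop-word handling below are illustrative):

from whoosh.query import Term, And, Or

def bigram_window_query(tokens, fieldname="body", stop_words=()):
    # One And-of-two-Terms per adjacent token pair, skipping stop words,
    # then OR the bigrams together so any matching pair scores the document.
    bigrams = [And([Term(fieldname, a), Term(fieldname, b)])
               for a, b in zip(tokens, tokens[1:]) if a not in stop_words]
    return Or(bigrams)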
Example #19
0
writer.add_document(id=u'guten02',
                    path=u'gutenberg/austen-persuasion.txt',
                    source=u'austen-persuasion.txt',
                    author=u'Jane Austen',
                    title=u'Chapter 1',
                    text=io.open('gutenberg/austen-persuasion.txt',
                                 encoding='utf-8').read())

writer.add_document(
    id=u'guten03',
    path=u'gutenberg/blake-poems.txt',
    source=u'blake-poems.txt',
    author=u'William Blake',
    title=u'SONGS OF INNOCENCE AND OF EXPERIENCE and THE BOOK of THEL',
    text=io.open('gutenberg/blake-poems.txt', encoding='utf-8').read())
writer.commit()

#=============Query===========
index = open_dir("index")
searcher = index.searcher()
query = And([Term("text", "song"), Term("text", "wild")])

results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

parser = QueryParser("text", index.schema)
parser.parse("song wild person")
parser.parse("(song OR wild) AND (song OR austen)")
parser.parse("song wild author:'William Blake'")
Example #20
0
 def restrict_query(self, request):
     return Or([
         And([Term('public', 't'), Term('searchable', 't')]),
         Term('users', request.user.username)
     ] + [Term('groups', group.name) for group in request.user.groups.all()])
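
Such a restriction query is typically passed as a whoosh filter, so it constrains but does not score the user's actual query. A hedged sketch; the view instance, `ix`, `user_query`, and the "text" field are assumptions for illustration:

from whoosh.qparser import QueryParser

allow = view.restrict_query(request)                 # the method defined above
with ix.searcher() as searcher:
    results = searcher.search(QueryParser("text", ix.schema).parse(user_query),
                              filter=allow)          # only public/searchable or permitted docs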
Example #21
0
if not os.path.exists('index'):     # create the directory "index" if it does not exist
    os.mkdir('index')
ix = create_in("index", schema)     # build the index directory according to the schema
ix = open_dir("index")              # open that directory to store the index files

import pymysql
db = pymysql.connect(host='localhost',user="******",password='******',database='lol',port=3306,charset='utf8')
cursor = db.cursor()
sql = "select * from test"
try:
    cursor.execute(sql)
    row = cursor.fetchall()  
except:
    print("error")

writer = ix.writer()
for i in range(10):
    path = row[i][2]
    title = row[i][10]
    content = row[i][5]
    writer.add_document(path=path,title=title,content=content)
writer.commit()

from whoosh.qparser import QueryParser
from whoosh.query import And, Or, Term
ix = open_dir("index")
with ix.searcher() as searcher:
    query = And([Term('content', u'框架'), Term('content', u'服务')])    # build a `query` object requiring both terms in `content`
    res = searcher.search(query)
    print(res[0])
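
The same conjunction can be produced by the query parser instead of hand-built Term objects; a small sketch against the index created above (assuming the content field's analyzer tokenizes the two words separately):

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    # The default parser ANDs whitespace-separated terms, so this parses to the
    # same query as And([Term('content', u'框架'), Term('content', u'服务')]).
    q = QueryParser("content", ix.schema).parse(u'框架 服务')
    print(searcher.search(q)[0])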
Example #22
0
    def perform_search(self, sentence):
        with self._searcher() as s:
            tokens = sentence.split()
            tokens = [token for token in tokens if token != REPLACED]
            print('tokens=', tokens)
            exact_and_match = And([Term(TEXT_FIELD, t) for t in tokens],
                                  boost=.5)
            exact_or_match = Or([Term(TEXT_FIELD, t) for t in tokens],
                                boost=.5,
                                scale=0.9)
            # Added variability of maxdist based on word length
            fuzzy_or_match = Or([
                FuzzyTerm(TEXT_FIELD,
                          t,
                          prefixlength=1,
                          maxdist=1 if len(t) < 8 else 2)
                for t in tokens if len(t) >= 4
            ],
                                boost=.2,
                                scale=0.9)
            if len(tokens) > 1:
                # add bigrams if there are any
                bigrams = ['_'.join(b) for b in find_ngrams(tokens, 2)]
                bigram_fuzzy_or_match = Or([
                    FuzzyTerm(BIGRAMS_FIELD,
                              b,
                              prefixlength=3,
                              maxdist=2 if len(b) < 8 else 3) for b in bigrams
                ],
                                           scale=0.9)
            else:
                bigram_fuzzy_or_match = None

            non_brand_or_match = Or(
                [Term(NONBRAND_TEXT_FIELD, t) for t in tokens])

            # q = exact_and_match \
            # | exact_or_match \
            # | fuzzy_or_match

            # my_match = Or([Term(f, token) for token in tokens], boost=1)
            # q = my_match

            #
            # q = Or([FuzzyTerm(f, token, prefixlength=2) for token in tokens if len(token) >= 3], boost=1.0,
            #                    scale=0.9)

            q = exact_and_match | exact_or_match | fuzzy_or_match | non_brand_or_match

            if bigram_fuzzy_or_match:
                q = q | bigram_fuzzy_or_match

            print(q)
            search_results = self.get_search_results(self._index, s, q)

            for x in search_results:
                print(x, x.score)

            if search_results:
                score, text, matched = search_results[0].items()
                return text, list(set(matched))
            else:
                return None, None
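
The `q = exact_and_match | exact_or_match | ...` chaining above relies on whoosh's query operator overloading; a tiny illustrative sketch of the equivalence:

from whoosh.query import Term, And, Or

a = Term("text", "alpha")
b = Term("text", "bravo")
q_or = a | b        # builds the same compound query as Or([a, b])
q_and = a & b       # builds the same compound query as And([a, b])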
Example #23
0
    def search(self, role, category, title, prev_turns):
        query = self.get_query(prev_turns[-1 * self.context_size:], title)
        # Only consider buyer/seller utterances
        filter_query = And(
            [Term('role', unicode(role)),
             Term('category', unicode(category))])
        start_time = time.time()
        with self.ix.searcher() as searcher:
            results = searcher.search(query,
                                      filter=filter_query,
                                      limit=self.num_candidates,
                                      terms=True)
            # One more try, with a slightly larger context window
            if len(results) == 0:
                query = self.get_query(
                    prev_turns[-1 * (self.context_size + 1):], title)
                results = searcher.search(query,
                                          filter=filter_query,
                                          limit=self.num_candidates,
                                          terms=True)

            results = self.remove_duplicates(results)
            results = [{
                'response': r['response'],
                'context': r['immediate_context'],
                'hits': [x[1] for x in r.matched_terms()],
                'pos': r['pos'],
            } for r in results]

        # Sort by BLEU
        ref = self.process_turn(prev_turns[-1]).split()
        results = sorted(results,
                         key=lambda r: compute_bleu(r['context'], ref),
                         reverse=True)

        offered = markers.OFFER in prev_turns[-1]
        if not offered:
            results = [
                r for r in results if not (markers.ACCEPT in r['response']
                                           or markers.REJECT in r['response'])
            ]
        else:
            results = [
                r for r in results if (markers.ACCEPT in r['response']
                                       or markers.REJECT in r['response'])
            ]
            if len(results) == 0:
                results.append({
                    'response': [markers.ACCEPT],
                    'context': [],
                    'hits': []
                })
                results.append({
                    'response': [markers.REJECT],
                    'context': [],
                    'hits': []
                })

        n = len(results)
        if n == 0:
            self.num_empty += 1

        #if n < self.num_candidates:
        #    results.extend([{} for _ in xrange(self.num_candidates - n)])

        self.num_query += 1
        self.search_time += (time.time() - start_time)
        return results
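
Because the search above runs with terms=True, each hit can report which query terms matched; a minimal sketch (the index, query, and field names are illustrative):

with ix.searcher() as searcher:
    for hit in searcher.search(query, terms=True):
        # matched_terms() yields (fieldname, termtext) pairs, which is why the
        # code above keeps only x[1] (the matched term text) for each result.
        matched = [text for fieldname, text in hit.matched_terms()]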
Example #24
0
def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2),
             Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert q == And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                     Variations("a", "BB", boost=2.0)])
Example #25
0
def test_merge_ranges():
    q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))])
    assert q.normalize() == TermRange("f1", u("a"), u("z"))

    q = And([NumericRange("f1", None, u("aaaaa")),
             NumericRange("f1", u("zzzzz"), None)])
    assert q.normalize() == q

    q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")])
    assert q.normalize() == TermRange("f1", u("a"), u("z"))

    q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert q.normalize() == TermRange("f1", u("f"), u("m"))

    q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert q.normalize() == TermRange("f1", u("a"), u("q"))

    q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))])
    assert q.normalize() == Every("f1")

    q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")])
    assert q.normalize() == Every("f1")

    q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None),
            TermRange("f1", None, u("n"))])
    assert q.normalize() == Every("f1")

    q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")])
    assert q.normalize() == Every("f1")

    q = And([Term("f1", u("a")), And([Or([Every("f1")])])])
    assert q.normalize() == Every("f1")
Example #26
0
def test_merge_ranges():
    q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([NumericRange("f1", None, u("aaaaa")), NumericRange("f1", u("zzzzz"), None)])
    assert_equal(q.normalize(), q)

    q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("f"), u("m")))

    q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("q")))

    q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")])
    assert_equal(q.normalize(), Every("f1"))

    q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Term("f1", u("a")), And([Or([Every("f1")])])])
    assert_equal(q.normalize(), Every("f1"))
Example #27
0
def test_duplicates():
    q = And([Term("a", u("b")), Term("a", u("b"))])
    assert q.normalize() == Term("a", u("b"))

    q = And([Prefix("a", u("b")), Prefix("a", u("b"))])
    assert q.normalize() == Prefix("a", u("b"))

    q = And([Variations("a", u("b")), And([Variations("a", u("b")),
                                           Term("a", u("b"))])])
    assert q.normalize() == And([Variations("a", u("b")), Term("a", u("b"))])

    q = And([Term("a", u("b")), Prefix("a", u("b")),
             Term("a", u("b"), boost=1.1)])
    assert q.normalize() == q

    # Wildcard without * or ? normalizes to Term
    q = And([Wildcard("a", u("b")),
             And([Wildcard("a", u("b")), Term("a", u("b"))])])
    assert q.normalize() == Term("a", u("b"))
Example #28
0
def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2), Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert_equal(q, And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                         Variations("a", "BB", boost=2.0)]))
Example #29
0
# Write a python program that takes queries (you need to design the supported queries)
# and searches through the indexed archive using whoosh. A sample query to the program
# can be: "RT:yes, keywords" returns all the retweets that are related to the keywords.
# Your program should handle at least 4 queries (of your choice) similar to the sample query.

from whoosh.query import Term, And, Or
from whoosh.qparser import QueryParser
searcher = index.searcher()

parser = QueryParser("strong_hashtags", index.schema)
parser.parse("FIFAWWC USA JPN")


# Query 1: Player search
query = And([Term("tweet_text","tobin"),Term("tweet_text","heath")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 2: Player search
query = And([Term("tweet_text","alex"),Term("tweet_text","morgan")])
results = searcher.search(query)
print('# of hits:', len(results))
print('Best Match:', results[0])

# Query 3: USA JPN 
parser = QueryParser("strong_hashtags", index.schema)
query = parser.parse("USA JPN")
results = searcher.search(query)
print('# of hits:', len(results))