Esempio n. 1
0
def search(humanReadableId):
    """Search the full-text index of one ZIM archive and render the results.

    The search keywords are read from the ``q`` request argument and the
    page number from the ``page`` request argument.  When no keywords are
    given an error message is flashed and the template is rendered without
    results.

    :param humanReadableId: identifier of the archive; it names the index
        sub-directory below the configured ``wikipedia_index_dir``.
    :returns: the rendered ``zim/search.html`` template.
    """
    query = request.args.get('q', '').strip()
    # Both values are handed to the template on every path, so they must be
    # bound even when no search is performed.
    pagination = None
    suggestion = None
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        # NOTE(review): humanReadableId is joined into a filesystem path
        # unvalidated -- confirm the route restricts it (no "..", no "/").
        index_dir = os.path.join(index_base_dir, humanReadableId)

        # A malformed page number falls back to the first page instead of
        # failing the whole request.
        try:
            page = int(request.args.get('page', 1))
        except (TypeError, ValueError):
            page = 1

        # Load index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)

        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages whose title starts with "Image:" after regular articles
        def image_pages_last(searcher, docnum):
            stored = searcher.stored_fields(docnum)
            return 1 if stored['title'].startswith("Image:") else 0

        facets = [
            sorting.FunctionFacet(image_pages_last),
            sorting.ScoreFacet(),
        ]
        # Support older whoosh indexes that do not have a reverse_links field
        if 'reverse_links' in ix.schema.names():
            facets.append(sorting.FieldFacet("reverse_links", reverse=True))
        sortedby = sorting.MultiFacet(facets)

        (pagination, suggestion) = paginated_search(ix, ["title", "content"],
                                                    query,
                                                    page,
                                                    weighting=weighting,
                                                    sort_column=sortedby)
    else:
        flash(_('Please input keyword(s)'), 'error')

    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search',
                               {'humanReadableId': humanReadableId}))
Esempio n. 2
0
    def test(ix):
        """Check single-field, reversed and multi-field sorting on ``ix``.

        Relies on ``sorted_titles`` from the enclosing scope for the expected
        alphabetical title order.
        """
        def titles_sorted_by(searcher, facet):
            # Match every document and return the hit titles in result order.
            results = searcher.search(query.Every(), sortedby=facet)
            return [hit["title"] for hit in results]

        with ix.searcher() as s:
            # Plain field name: ascending title order.
            assert titles_sorted_by(s, "title") == sorted_titles

            # Explicit facet: descending title order.
            by_title_desc = sorting.FieldFacet("title", reverse=True)
            assert titles_sorted_by(s, by_title_desc) == \
                list(reversed(sorted_titles))

            # Primary key "num", ties broken by reverse title.
            by_num_then_title = (sorting.MultiFacet()
                                 .add_field("num")
                                 .add_field("title", reverse=True))
            target = [
                "Visual and Statistical Thinking",
                "Cognitive Style of Powerpoint",
                "Beautiful Evidence",
                "Visual Explanations",
                "Visual Display of Quantitative Information, The",
                "Envisioning Information",
            ]
            assert titles_sorted_by(s, by_num_then_title) == target
Esempio n. 3
0
def test_multifacet():
    """Group all documents by the combined (tag, size) key of a MultiFacet."""
    schema = fields.Schema(tag=fields.ID(stored=True),
                           size=fields.ID(stored=True))

    # Insertion order matters: the expected groups below refer to documents
    # by their position in this list.
    documents = [
        ("alfa", "small"),
        ("bravo", "medium"),
        ("alfa", "large"),
        ("bravo", "small"),
        ("alfa", "medium"),
        ("bravo", "medium"),
    ]
    expected_groups = {
        (u('bravo'), u('medium')): [1, 5],
        (u('alfa'), u('large')): [2],
        (u('alfa'), u('medium')): [4],
        (u('alfa'), u('small')): [0],
        (u('bravo'), u('small')): [3]
    }

    with TempIndex(schema, "multifacet") as ix:
        writer = ix.writer()
        for tag, size in documents:
            writer.add_document(tag=u(tag), size=u(size))
        writer.commit()

        with ix.searcher() as s:
            tag_and_size = sorting.MultiFacet(["tag", "size"])
            results = s.search(query.Every(),
                               groupedby={"tag/size": tag_and_size})
            assert_equal(results.groups("tag/size"), expected_groups)
Esempio n. 4
0
def search(term, ix='indexdir', limit=None):
    """Search the ``line`` field and return hits ordered by file and number.

    :param term: query string, parsed against the ``line`` field.
    :param ix: an already opened index, or the path of an index directory.
    :param limit: maximum number of hits, passed through to the searcher.
    :returns: the results object produced by the searcher.  The searcher is
        left open because the returned results are still bound to it.
    """
    # Accept either a directory name or an index object.
    opened = index.open_dir(ix) if isinstance(ix, str) else ix

    searcher = opened.searcher()
    parsed = QueryParser('line', schema=opened.schema).parse(term)

    # Order by file name first, then by the "number" field within a file.
    ordering = sorting.MultiFacet()
    for field_name in ('filename', 'number'):
        ordering.add_field(field_name)

    return searcher.search(parsed, limit=limit, sortedby=ordering)
Esempio n. 5
0
    def search(self,
               query,
               section=None,
               page=1,
               per_page=20,
               excerpt_fragmenter=None,
               excerpt_maxchars=None,
               excerpt_surround=None):
        """Run a paginated search over the ``title`` and ``content`` fields.

        Hits are ordered by the ``priority`` field, highest first.  When
        ``section`` is given, only documents of that section match.

        :returns: a dict with the keys ``items`` (one dict per hit holding
            ``path``, ``title``, ``excerpt`` and ``section``), ``pages``,
            ``page`` and ``per_page``.
        """
        parser = MultifieldParser(['title', 'content'], self.schema)
        parsed = parser.parse(unicode(query))
        if section is not None:
            # Restrict the match to the requested section.
            parsed = And([parsed, Term('section', unicode(section))])

        by_priority = sorting.MultiFacet()
        by_priority.add_field("priority", reverse=True)

        def _make_item(hit):
            # The excerpt needs the document text, which is fetched through
            # get_content(); without text there is nothing to highlight.
            text = self.get_content(hit['path'], hit['section'])
            excerpt = None
            if text is not None:
                excerpt = hit.highlights('content', text=text)
            return dict(path=hit['path'],
                        title=hit['title'],
                        excerpt=excerpt,
                        section=section)

        with self.whoosh_index.searcher() as searcher:
            result_page = searcher.search_page(parsed, page,
                                               sortedby=by_priority,
                                               pagelen=per_page)
            fragmenter, analyzer = make_fragmenter_and_analyzer(
                excerpt_fragmenter, excerpt_maxchars, excerpt_surround)

            # Configure highlighting before the hits are turned into items.
            hits = result_page.results
            hits.formatter = make_html_formatter()
            if fragmenter is not None:
                hits.fragmenter = fragmenter
            if analyzer is not None:
                hits.analyzer = analyzer

            items = [_make_item(hit) for hit in hits]
            return {
                'items': items,
                'pages': result_page.pagecount,
                'page': page,
                'per_page': per_page
            }
def test_sorted_extend():
    """Extending one sorted result set with another yields their union."""
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           keywords=fields.TEXT,
                           num=fields.NUMERIC(stored=True, sortable=True))
    domain = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    keys = u"juliet kilo lima november oskar papa quebec romeo".split()

    # Expected hit counts, tallied while the documents are written.
    title_hits = 0
    keyword_hits = 0
    union_hits = 0

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for i, words in enumerate(permutations(domain, 3)):
                # len(domain) - 1 equals len(keys) for the lists above, so
                # the index cycles through every key.
                key = keys[i % (len(domain) - 1)]
                in_title = "bravo" in words
                in_keywords = key == "kilo"

                if in_title:
                    title_hits += 1
                if in_keywords:
                    keyword_hits += 1
                if in_title or in_keywords:
                    union_hits += 1

                w.add_document(title=u" ".join(words), keywords=key, num=i)

        with ix.searcher() as s:
            # Descending "num" first, then score.
            facet = sorting.MultiFacet([
                sorting.FieldFacet("num", reverse=True),
                sorting.ScoreFacet()
            ])

            def run(fieldname, text):
                return s.search(query.Term(fieldname, text),
                                limit=None,
                                sortedby=facet)

            by_title = run("title", "bravo")
            by_keyword = run("keywords", "kilo")

            assert len(by_title) == title_hits
            assert len(by_keyword) == keyword_hits

            # extend() must not duplicate documents found by both queries.
            by_title.extend(by_keyword)
            assert len(by_title) == union_hits
Esempio n. 7
0
def test_numeric_field_facet():
    """Sort by two numeric fields: ``v1`` ascending, then ``v2`` descending.

    The documents are written in three separate commits.  ``merge=False`` is
    an argument of ``commit()``, not of ``ix.writer()``, so it is passed to
    the final commit (matching how ``test_score_facet`` requests it).
    """
    schema = fields.Schema(id=fields.STORED, v1=fields.NUMERIC,
                           v2=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, v1=2, v2=100)
    w.add_document(id=2, v1=1, v2=50)
    w.commit()
    w = ix.writer()
    w.add_document(id=3, v1=2, v2=200)
    w.add_document(id=4, v1=1, v2=100)
    w.commit()
    w = ix.writer()
    w.add_document(id=5, v1=2, v2=50)
    w.add_document(id=6, v1=1, v2=200)
    # Ask the commit itself not to merge this batch into earlier segments.
    w.commit(merge=False)

    with ix.searcher() as s:
        mf = sorting.MultiFacet().add_field("v1").add_field("v2", reverse=True)
        r = s.search(query.Every(), sortedby=mf)
        # v1 == 1 group first (v2: 200, 100, 50), then v1 == 2 (200, 100, 50).
        assert [hit["id"] for hit in r] == [6, 4, 2, 3, 1, 5]
Esempio n. 8
0
def test_score_facet():
    """Sort by field ``b`` first and by score within equal ``b`` values."""
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT,
                           c=fields.ID)
    ix = RamStorage().create_index(schema)

    # Each batch is written with its own writer and committed with the given
    # keyword arguments; the later batches are committed with merge=False.
    batches = [
        ({}, [(1, "alfa alfa bravo", "bottle"),
              (2, "alfa alfa alfa", "bottle")]),
        ({"merge": False}, [(3, "alfa bravo bravo", "bottle"),
                            (4, "alfa bravo alfa", "apple")]),
        ({"merge": False}, [(5, "alfa bravo bravo", "apple"),
                            (6, "alfa alfa alfa", "apple")]),
    ]
    for commit_kwargs, docs in batches:
        writer = ix.writer()
        for doc_id, a_text, b_text in docs:
            writer.add_document(id=doc_id, a=u(a_text), b=u(b_text),
                                c=u("c"))
        writer.commit(**commit_kwargs)

    with ix.searcher() as s:
        by_b_then_score = sorting.MultiFacet(["b", sorting.ScoreFacet()])
        hits = s.search(q=query.Term("a", u("alfa")), sortedby=by_b_then_score)
        found_ids = [hit["id"] for hit in hits]
        assert found_ids == [6, 4, 5, 2, 1, 3]
Esempio n. 9
0
def test_multisort():
    """Sort on (tag, id) in every combination of direction and limit."""
    mf = sorting.MultiFacet(["tag", "id"])

    def tag_then_id(d):
        # Reference key mirroring the facet order.
        return (d["tag"], d["id"])

    variants = (
        {},
        {"reverse": True},
        {"limit": 5},
        {"reverse": True, "limit": 5},
    )
    for options in variants:
        try_sort(mf, tag_then_id, **options)
Esempio n. 10
0
#             # tags=",".join(one['tags']),
#             content=one['index_content']
#         )
#         counts += 1
#         if counts == 200:
#             break



count = 0
start = time.time()
with ix.searcher(weighting=scoring.BM25F()) as searcher:
    # Look the keyword up in both the title and the content fields.
    # (Single-field alternative: QueryParser("content", ix.schema).parse("xss"))
    query = MultifieldParser(
        ["title", "content"], ix.schema
    ).parse("xss")

    # Order the hits by the "date" field, descending.
    mf = sorting.MultiFacet([sorting.FieldFacet("date", reverse=True)])

    # (Paged alternative: searcher.search_page(query, 2, pagelen=10))
    results = searcher.search(query, limit=10, sortedby=mf)
    print(len(results))

    for one in results:
        # The index stores the database id of each document in "nid"; the
        # full record is fetched from the collection for display.
        _id = ObjectId(one['nid'])
        res = collections.find({'_id': _id})[0]
        print(res['date'] + res['title'])
        print('-----------------------\n')