def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city',
                     'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields,
                          ix.schema,
                          termclass=query.Variations,
                          group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    elif function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
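Both functions in this snippet assume a common import header that is not shown; a hedged sketch of what they need (standard Whoosh modules, plus the numpy alias used by person_query_search below):

# Imports assumed by query_search above and person_query_search below
# (standard Whoosh modules; numpy backs the personalized re-ranking).
import numpy as np
from whoosh import index, query, scoring, sorting, qparser
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin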
def person_query_search(indexdir, queries, user_id, E, n=10, function='BM25F'):
    prediction = user_cf(E, user_id, 3)
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city',
                     'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields,
                          ix.schema,
                          termclass=query.Variations,
                          group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        # To weight the resname and categories fields higher, BM25F accepts
        # per-field B values, e.g.:
        # with ix.searcher(weighting=scoring.BM25F(B=0.75, resname_B=1.0, categories_B=0.8, K1=1.2)) as s:
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (relevance.max() -
                                                             relevance.min())
                # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(
                        results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    elif function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (
                    relevance.max() - relevance.min()
                )  # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(
                        results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    return result_index
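person_query_search calls a user_cf collaborative-filtering helper that is not shown here. A minimal stand-in, assuming E is a user-by-item rating matrix whose columns are indexed by the item IDs stored in the index, and the third argument is the neighborhood size:

# Hypothetical stand-in for the user_cf helper used above: given a user-item
# rating matrix E, return one predicted rating per item for user_id, averaged
# over the k most similar users (cosine similarity between rating rows).
import numpy as np

def user_cf(E, user_id, k):
    norms = np.linalg.norm(E, axis=1) + 1e-9             # avoid divide-by-zero
    sims = (E @ E[user_id]) / (norms * norms[user_id])   # cosine similarity
    sims[user_id] = -np.inf                              # exclude the user itself
    neighbors = np.argsort(sims)[-k:]                    # k most similar users
    return E[neighbors].mean(axis=0)                     # mean neighbor rating per item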
Example #3
def search(humanReadableId):
    query = request.args.get('q', '').strip()
    pagination = None
    suggestion = None  # ensure defined for the render_template call below
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        index_dir = os.path.join(index_base_dir, humanReadableId)
        page = int(request.args.get('page', 1))

        # Load index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)

        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages with "Image:" in their title after
        # regular articles
        def image_pages_last(searcher, docnum):
            fields = searcher.stored_fields(docnum)
            if fields['title'].startswith("Image:"):
                return 1
            else:
                return 0

        # Support older whoosh indexes that do not have a reverse_links field
        if 'reverse_links' in ix.schema.names():
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
                sorting.FieldFacet("reverse_links", reverse=True),
            ])
        else:
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
            ])

        (pagination, suggestion) = paginated_search(ix, ["title", "content"],
                                                    query,
                                                    page,
                                                    weighting=weighting,
                                                    sort_column=sortedby)
    else:
        flash(_('Please enter keyword(s)'), 'error')

    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search',
                               {'humanReadableId': humanReadableId}))
Example #4
def post_search(pn=1, size=10):
    """

    :param pn:
    :param size:
    :return:
    """
    keyword = request.values.get('kw')
    if keyword is None:
        return render_template('search/list.html',
                               title='Search',
                               message='Search keyword(s) must not be empty')
    with whoosh_searcher.get_searcher('posts') as searcher:
        parser = qparser.MultifieldParser(
            ['title', 'content'],
            whoosh_searcher.get_index('posts').schema)
        q = parser.parse(keyword)
        result = searcher.search_page(q,
                                      pagenum=pn,
                                      pagelen=size,
                                      sortedby=sorting.ScoreFacet())
        result_list = [x.fields() for x in result.results]
        page = models.Page(page_num=pn,
                           per_page=size,
                           result_list=result_list,
                           has_more=result.pagecount > pn,
                           total_page=result.pagecount,
                           total=result.total)
    return render_template('search/list.html',
                           title=keyword + ' search results',
                           page=page,
                           kw=keyword)
Example #5
def post_search(pn=1, size=10):
    keyword = request.values.get('kw')
    if keyword is None:
        return render_template('search/list.html',
                               title='Search',
                               message='Search keyword(s) must not be empty!')
    with whoosh_searcher.get_searcher('posts') as searcher:
        # q = query.Or([query.Term('title', keyword), query.Term('content', keyword)])
        parser = qparser.MultifieldParser(
            ['title', 'content'],
            whoosh_searcher.get_index('posts').schema)
        q = parser.parse(keyword)
        result = searcher.search_page(q,
                                      pagenum=pn,
                                      pagelen=size,
                                      sortedby=sorting.ScoreFacet())
        result_list = [x.fields() for x in result.results]
        page = models.Page(pn,
                           size,
                           result=result_list,
                           has_more=result.pagecount > pn,
                           total_page=result.pagecount,
                           total=result.total)
        print(page.result)
    # return jsonify(page)
    return render_template('search/list.html',
                           title=keyword + ' search results',
                           page=page,
                           kw=keyword)
Example #6
def listarPorAtributo(busqueda="",
                      categoria=[],
                      order="",
                      groupDic={},
                      nElementosPagina=20,
                      pagina=1):
    tam = 0
    ix = index.open_dir("whooshLicor/licoresIndex")
    lista = []
    busqueda = busqueda.strip()
    with ix.searcher() as searcher:
        if not busqueda and not categoria:
            query = QueryParser("titulo", ix.schema).parse("*")
        elif not busqueda and categoria:
            query = (QueryParser("titulo", ix.schema).parse("*")
                     & queryCategoryGenerator(categoria))
        elif busqueda and not categoria:
            query = querySearchGenerator(busqueda)
        else:  # busqueda and categoria
            query = (querySearchGenerator(busqueda)
                     & queryCategoryGenerator(categoria))

        query.normalize()
        if not order:
            order = sorting.ScoreFacet()
        groupMap = agruparLista(groupDic)
        results = searcher.search(query,
                                  groupedby=groupMap,
                                  sortedby=[faceta_enStock(), order],
                                  limit=4000)
        grupo = range(0, searcher.doc_count())
        tam = len(results)
        if (groupMap):
            try:
                if "precio/graduacion" in groupMap.keys():
                    tuplaKey = (groupDic["precio"], groupDic["graduacion"])
                elif ("precio" in groupMap.keys()):
                    tuplaKey = groupDic["precio"]
                else:
                    tuplaKey = groupDic["graduacion"]

                grupo = results.groups(next(iter(groupMap)))[tuplaKey]
            except (KeyError, StopIteration):  # group or key not present
                grupo = []

            for documentIndex in grupo[(pagina - 1) * nElementosPagina:pagina *
                                       nElementosPagina]:
                elemento = searcher.stored_fields(documentIndex)
                lista.append(elemento['id'])
            tam = len(grupo)
        elif not (groupDic):
            for r in results[(pagina - 1) * nElementosPagina:pagina *
                             nElementosPagina]:
                lista.append(r['id'])
        return (lista, tam)
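listarPorAtributo depends on helpers (querySearchGenerator, queryCategoryGenerator, agruparLista, faceta_enStock) defined elsewhere. A plausible sketch of queryCategoryGenerator, assuming the categories live in a hypothetical 'categoria' field:

# Hypothetical sketch of the queryCategoryGenerator helper used above.
from whoosh import query

def queryCategoryGenerator(categoria):
    # OR together one Term per requested category (field name is an assumption)
    return query.Or([query.Term("categoria", c) for c in categoria])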
Example #7
    def search_name(self, key_word):
        qp = QueryParser("novelName", schema=self.ix.schema)
        q = qp.parse(key_word)

        # score
        scores = sorting.ScoreFacet()

        results = self.searcher.search(q, limit=LIMIT, sortedby=[scores])
        print(len(results))
        for i in results:
            print(i)

        return results
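search_name assumes an enclosing class that holds an open index and a long-lived searcher, plus a LIMIT constant; a minimal sketch of that context (class and directory names are hypothetical):

# Hypothetical context assumed by search_name above.
from whoosh import index

LIMIT = 20  # hypothetical result limit

class NovelSearcher:
    def __init__(self, index_dir):
        # keep one index and one long-lived searcher for repeated queries
        self.ix = index.open_dir(index_dir)
        self.searcher = self.ix.searcher()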
Example #8
    def search(self, query_string, page="1", limit=20):
        results = []
        query_string = unicode(query_string, 'utf-8')  # Python 2: decode bytes to text
        with self.index.searcher() as searcher:
            query = QueryParser("content",
                                self.index.schema).parse(query_string)

            scores = sorting.ScoreFacet()
            sortperson = sorting.FieldFacet("person")
            sortcollection = sorting.FieldFacet("collection", reverse=True)

            resultset = searcher.search_page(
                query,
                int(page),
                pagelen=int(limit),
                sortedby=[sortcollection, scores, sortperson])
            # NOTE: Need to copy plain dicts out, since once the searcher
            #   dies (end of with block), the Hit results lose their reference to
            #   the data.
            for hit in resultset[0:]:
                # Grab a copy of the results as a plain dict.
                result = hit.fields()

                # Also grab the surrounding fragment as a highlight.
                # NOTE: This is pretty much the only point we know
                #   "where" in the matched document the hit occurs.
                #   The raw content we indexed is stored in 'content',
                #   so we tell the Hit instance to pull the surrounding
                #   text fragments from there.
                # Also:
                #   These highlights are pretty much the only reason
                #   we need to bother stashing the entire document.
                #   Otherwise, the index can be even smaller.
                #   Whoosh allows to hunt for the content in the
                #   original files, if they're available.  But as our
                #   text content isn't large -- keeping it in the
                #   index seems faster.
                result['highlights'] = hit.highlights('content')
                results.append(result)

            results = {
                'matches': results,
                'matches_returned': resultset.scored_length(),
                'total_matches': len(resultset),
                'query': query_string
            }
        return results
Example #9
    def search(self, key_word):
        # qp = QueryParser("novelName", schema=self.ix.schema)
        qp = MultifieldParser(
            ["novelName", "novelAuthor", "novelIntroduction"],
            schema=self.ix.schema)
        q = qp.parse(key_word)

        # score
        scores = sorting.ScoreFacet()

        results = self.searcher.search(q, limit=LIMIT, sortedby=[scores])
        print(len(results))
        for i in results:
            print(i)
            # print(i.highlights())
            # print(i.more_like_this("novelAuthor"))

        return results
Example #10
def autocomplete(query_str, results=10):
    query_str = u' '.join([
        t.text for t in _analyzer(query_str)
        if not 'university'.startswith(t.text)
    ])

    q = _query_parser.parse(query_str)
    return [
        _ror_rows[row['ror']] for row in _searcher.search_page(
            q,
            1,
            results,
            sortedby=[
                sorting.FieldFacet('citation_score', reverse=True),
                sorting.FieldFacet('num_students', reverse=True),
                sorting.ScoreFacet(),
            ])
    ]
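autocomplete relies on module-level objects (_analyzer, _query_parser, _searcher, _ror_rows) built elsewhere; a hedged sketch of that setup, with the index directory, field name, and row mapping as assumptions:

# Hypothetical module-level setup assumed by autocomplete above.
from whoosh import index, qparser
from whoosh.analysis import StemmingAnalyzer

_analyzer = StemmingAnalyzer()                  # tokenizes/stems the input query
_ix = index.open_dir("ror_index")               # hypothetical index directory
_searcher = _ix.searcher()
_query_parser = qparser.QueryParser("name", _ix.schema, group=qparser.OrGroup)
_ror_rows = {}                                  # hypothetical: ROR id -> full record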
Example #11
def test_sorted_extend():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           keywords=fields.TEXT,
                           num=fields.NUMERIC(stored=True, sortable=True))
    domain = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    keys = u"juliet kilo lima november oskar papa quebec romeo".split()

    combined = 0
    tcount = 0
    kcount = 0
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for i, words in enumerate(permutations(domain, 3)):
                key = keys[i % (len(domain) - 1)]
                if "bravo" in words:
                    tcount += 1
                if key == "kilo":
                    kcount += 1
                if "bravo" in words or key == "kilo":
                    combined += 1

                w.add_document(title=u" ".join(words), keywords=key, num=i)

        with ix.searcher() as s:
            facet = sorting.MultiFacet([
                sorting.FieldFacet("num", reverse=True),
                sorting.ScoreFacet()
            ])

            r1 = s.search(query.Term("title", "bravo"),
                          limit=None,
                          sortedby=facet)
            r2 = s.search(query.Term("keywords", "kilo"),
                          limit=None,
                          sortedby=facet)

            assert len(r1) == tcount
            assert len(r2) == kcount
            r1.extend(r2)
            assert len(r1) == combined
Example #12
def post_search(pn=1, size=10):
    keyword = request.values.get('kw')
    if keyword is None:
        return render_template('search/list.html',
                               title='Search',
                               message='Search keyword(s) must not be empty!')
    whoosh_searcher.clear('posts')
    writer = whoosh_searcher.get_writer('posts')
    for item in mongo.db['posts'].find(
        {}, ['_id', 'title', 'content', 'create_at', 'user_id', 'catalog_id']):
        item['obj_id'] = str(item['_id'])
        item['user_id'] = str(item['user_id'])
        item['catalog_id'] = str(item['catalog_id'])
        item.pop('_id')
        writer.add_document(**item)
    # Commit the changes
    writer.commit()
    with whoosh_searcher.get_searcher('posts') as searcher:
        # Parse the query string
        parser = qparser.MultifieldParser(
            ['title', 'content'],
            whoosh_searcher.get_index('posts').schema)
        q = parser.parse(keyword)
        print('q:', q)
        # Run the search
        result = searcher.search_page(q,
                                      pagenum=pn,
                                      pagelen=size,
                                      sortedby=sorting.ScoreFacet())
        result_list = [x.fields() for x in result.results]
        # Build the page object
        page = Page(pn,
                    size,
                    result=result_list,
                    has_more=result.pagecount > pn,
                    page_count=result.pagecount,
                    total=result.total)
    return render_template('search/list.html',
                           title=keyword + ' search results',
                           page=page,
                           kw=keyword)
Example #13
def test_score_facet():
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT,
                           c=fields.ID)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c"))
    w.commit()
    w = ix.writer()
    w.add_document(id=3, a=u("alfa bravo bravo"), b=u("bottle"), c=u("c"))
    w.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)
    w = ix.writer()
    w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c"))
    w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c"))
    w.commit(merge=False)

    with ix.searcher() as s:
        facet = sorting.MultiFacet(["b", sorting.ScoreFacet()])
        r = s.search(q=query.Term("a", u("alfa")), sortedby=facet)
        assert [h["id"] for h in r] == [6, 4, 5, 2, 1, 3]
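test_score_facet follows the Whoosh test-suite conventions; the imports it assumes are:

# Imports assumed by test_score_facet above (Whoosh test-suite style).
from whoosh import fields, query, sorting
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage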
Example #14
 def search(self, parameter):
     # Extract the query fields and build the parser
     keys = parameter['keys']
     parser = None
     if len(keys) == 1:
         parser = QueryParser(keys[0], schema=self.index.schema)
     elif len(keys) > 1:
         parser = MultifieldParser(keys, schema=self.index.schema)
     # Search parameters (sorting, pagination)
     score = sorting.ScoreFacet()  # relevance
     id = sorting.FieldFacet('id', reverse=False)  # id field, ascending
     _limit = None  # pagination limit
     if 'page' in parameter and 'pagesize' in parameter:
         page = parameter['page']
         pagesize = parameter['pagesize']
         if page > 0 and pagesize != 0:
             _limit = page * pagesize
     # Run the search
     query = parser.parse(parameter['keywords'])
     result = self.searcher.search(
         query,
         limit=_limit,
         sortedby=[score]  # sort by relevance
     )
     # Return the results
     res = list()
     for hit in result:
         res.append({
             'title': hit['title'],
             'url': hit['url'],
             # strip HTML tags from the highlight fragment; note re.sub takes
             # flags as a keyword argument (passing re.S positionally would be
             # interpreted as the `count` argument)
             'content': re.sub(r'<[^>]+>', ' | ',
                               hit.highlights('content'), flags=re.S),
             'score': str(hit.score),
         })
     return res
Example #15
# coding=utf-8
from whoosh.qparser import QueryParser
from whoosh import qparser, sorting
from whoosh.index import open_dir
from whoosh.sorting import FieldFacet

index_filepath = "./index/"
# source_filepath=index_filepath+"0407_songs_dr2.csv"
default_index = open_dir(index_filepath, indexname='book')  # open the prebuilt index

# Default sort: score + album + song
default_facet = []
default_facet.append(sorting.ScoreFacet())
# default_facet.append(FieldFacet("album_title", reverse=True))  # 按序排列搜索结果
default_facet.append(FieldFacet("book_tittle", reverse=True))

# Default query mode is AND; default search scope is all fields
default_group = qparser.syntax.AndGroup
default_range = ['book_tittle', 'book_author', 'year', 'publisher', 'ISBN']


# Basic single-item search
def basic_search(query,
                 query_parse,
                 group=default_group,
                 facet=default_facet,
                 index=default_index):
    searcher = index.searcher()
    parser = QueryParser(query_parse, index.schema, group=group)
    myquery = parser.parse(query)
    parser.remove_plugin_class(qparser.PhrasePlugin)
Example #16
from whoosh.qparser import QueryParser
from whoosh import index, sorting, scoring
from whoosh import qparser
from config import SEARCH_INDEX_DIR
import math

from test_search import CWeighting
ix = index.open_dir(SEARCH_INDEX_DIR)
qp = QueryParser("name", schema=ix.schema, group=qparser.OrGroup)

facet = sorting.FieldFacet("zvalue", reverse=True)
scores = sorting.ScoreFacet()


def do_search(txt, sumlevel=None, kind=None, tries=0):
    if kind:
        txt += " AND kind:{}".format(kind)
    if sumlevel:
        txt += " AND sumlevel:{}".format(sumlevel)
    if tries > 2:
        return [], []
    q = qp.parse(txt)

    with ix.searcher(weighting=CWeighting(txt)) as s:
        corrector = s.corrector("display")
        suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3)
        results = s.search(q, sortedby=[scores])
        data = [[
            r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
            r["sumlevel"]
        ] for r in results]
Example #17
def sortResults(sortMethod, searcher, query):
    # ascending order by price
    if sortMethod == "1":
        try:
            results = searcher.search(query, limit=20, sortedby="price")
        except TermNotFound:
            results = []
    # descending order by price
    elif sortMethod == "2":
        try:
            results = searcher.search(query,
                                      limit=20,
                                      sortedby="price",
                                      reverse=True)
        except TermNotFound:
            results = []
    # sort by review polarity value with highest values first
    elif sortMethod == "3":
        try:
            results = searcher.search(query,
                                      limit=20,
                                      sortedby="reviewPolarity",
                                      reverse=True)
        except TermNotFound:
            results = []
    # sort by rating with highest values first
    elif sortMethod == "4":
        try:
            results = searcher.search(query,
                                      limit=20,
                                      sortedby="rating",
                                      reverse=True)
        except TermNotFound:
            results = []
    # sort by highest savings absolute value
    elif sortMethod == "5":
        try:
            results = searcher.search(query,
                                      limit=20,
                                      sortedby="savings",
                                      reverse=True)
        except TermNotFound:
            results = []
    # sort by highest savings percentage
    elif sortMethod == "6":
        try:
            results = searcher.search(query,
                                      limit=20,
                                      sortedby="percentageSavings",
                                      reverse=True)
        except TermNotFound:
            results = []
    # sort by highest review polarity value and highest ratings
    elif sortMethod == "7":
        try:
            rP = sorting.FieldFacet("reviewPolarity", reverse=True)
            r = sorting.FieldFacet("rating", reverse=True)
            weightedValue = sorting.TranslateFacet(average, rP, r)
            results = searcher.search(query, limit=20, sortedby=weightedValue)
        except TermNotFound:
            results = []
    # sort by lowest price, highest review polarity value, highest ratings
    elif sortMethod == "8":
        try:
            rP = sorting.FieldFacet("reviewPolarity", reverse=True)
            r = sorting.FieldFacet("rating", reverse=True)
            weightedValue = sorting.TranslateFacet(average, rP, r)
            results = searcher.search(query,
                                      limit=20,
                                      sortedby=[weightedValue, "price"])
        except TermNotFound:
            results = []
    elif sortMethod == "9":
        try:
            priceRangeLow = int(input("Enter the minimum price : "))
            priceRangeHigh = int(input("Enter the maximum price : "))
            numResultsDisplayed = int(
                input("Enter the number of records to be shown : "))
            results = searcher.search(query, limit=None, sortedby="price")
        except TermNotFound:
            results = []
    elif sortMethod == "10":
        try:
            countryData = input("Enter the country of origin : ")
            if re.match(r"(?i)(us)", countryData):
                countryData = "usa"
            numResultsDisplayed = int(
                input("Enter the number of records to be shown : "))
            scores = sorting.ScoreFacet()
            results = searcher.search(qp.parse(user_query + " " + countryData),
                                      limit=None,
                                      sortedby=scores)
        except TermNotFound:
            results = []

    return results
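sortResults assumes an average helper for the sorting.TranslateFacet calls, along with qp and user_query defined at module level; a minimal sketch of the helper under those assumptions:

# Hypothetical helper for the TranslateFacet calls above: combine the two
# facet keys (reviewPolarity and rating) into a single mean sort key.
def average(review_polarity, rating):
    return (review_polarity + rating) / 2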