Example #1
0
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern; may use '*' as truncation wildcard or the
        'left -> right' span syntax
    :param f: logical field name used to select the idxPAIR index
    :param m: matching mode, forwarded to the idxPHRASE fallback
    :param wl: wildcard limit for truncated/span queries; 0 means no limit
    :return: intbitset of matching record ids (empty on no match)
    :raises InvenioWebSearchWildcardLimitError: when a wildcard condition
        hits the limit 'wl'; the partial result set is attached.
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # Start from None so the first hitset is adopted as-is; initializing
    # with an empty intbitset would make every intersection come out empty.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    # list of (SQLAlchemy condition, use_query_limit) tuples
    conditions = []

    # a '%...%' wrapper is redundant for pair matching; strip it
    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append((column.between(pairs_left[-1],
                                              pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefully this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            # truncate wildcard queries so they cannot explode
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            # one condition matched nothing -> the intersection is empty
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                # first hitset: adopt it as the running intersection
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [
                        term for term in termlist
                        if term.lower().find(p.lower()) > -1
                ]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # result_set is None when no conditions were built at all
    return result_set or intbitset()
Example #2
0
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' inside idxPAIR table for field 'f' and return hitset.

    :param p: search pattern; may use '*' as truncation wildcard or the
        'left -> right' span syntax
    :param f: logical field name used to select the idxPAIR index
    :param m: matching mode, forwarded to the idxPHRASE fallback
    :param wl: wildcard limit for truncated/span queries; 0 means no limit
    :return: intbitset of matching record ids (empty on no match)
    :raises InvenioWebSearchWildcardLimitError: when a wildcard condition
        hits the limit 'wl'; the partial result set is attached.
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer
    )
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # Start from None so the first hitset is adopted as-is; initializing
    # with an empty intbitset would make every intersection come out empty.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    # list of (SQLAlchemy condition, use_query_limit) tuples
    conditions = []

    # a '%...%' wrapper is redundant for pair matching; strip it
    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True)
            )
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefully this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            # truncate wildcard queries so they cannot explode
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            # one condition matched nothing -> the intersection is empty
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                # first hitset: adopt it as the running intersection
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # result_set is None when no conditions were built at all
    return result_set or intbitset()