Esempio n. 1
0
def default_search_unit(p, f, m, wl):
    """Query correct index type and return hitset."""
    if m not in ('a', 'r') and not is_marc_tag(f):
        # default case: plain word search
        return search_unit_in_bibwords(p, f, wl=wl)
    # phrase search or regexp search: look up the configured index
    index_id = IdxINDEX.get_index_id_from_field(f)
    if index_id == 0:
        # no dedicated index for this field
        return search_unit_in_bibxxx(p, f, m or 'a', wl)
    if m == 'a' and index_id in IdxINDEX.get_idxpair_field_ids():
        # exact match on the admin-configured fields uses the pair tables
        return search_unit_in_idxpairs(p, f, m or 'a', wl)
    return search_unit_in_idxphrases(p, f, m or 'a', wl)
Esempio n. 2
0
def default_search_unit(p, f, m, wl):
    """Query correct index type and return hitset."""
    phrase_or_regexp = m in ('a', 'r') or is_marc_tag(f)
    if not phrase_or_regexp:
        # we are doing bibwords search by default
        hitset = search_unit_in_bibwords(p, f, wl=wl)
    else:
        index_id = IdxINDEX.get_index_id_from_field(f)
        if index_id == 0:
            # field has no dedicated index
            hitset = search_unit_in_bibxxx(p, f, m or 'a', wl)
        elif m == 'a' and index_id in IdxINDEX.get_idxpair_field_ids():
            # exact match on the admin configured fields:
            # search in the pair tables
            hitset = search_unit_in_idxpairs(p, f, m or 'a', wl)
        else:
            hitset = search_unit_in_idxphrases(p, f, m or 'a', wl)
    return hitset
Esempio n. 3
0
def autocomplete(field, q):
    """Autocomplete data from indexes.

    It uses POSTed arguments with name `q` that has to be longer than 3
    characters in order to return any results.
    (NOTE(review): the length restriction is not enforced here — presumably
    done by the caller/route; confirm.)

    :param field: index name
    :param q: query string for index term

    :return: JSON response with the list of values matching query.
    """
    IdxPHRASE = IdxINDEX.idxPHRASEF(field, fallback=False)
    results = IdxPHRASE.query.filter(
        IdxPHRASE.term.contains(q)).limit(20).values('term')
    # materialize a concrete list of dicts: on Python 3 `map` returns a
    # lazy iterator, which jsonify cannot serialize to JSON
    results = [{'value': r[0]} for r in results]

    return jsonify(results=results)
Esempio n. 4
0
def autocomplete(field, q):
    """Autocomplete data from indexes.

    It uses POSTed arguments with name `q` that has to be longer than 3
    characters in order to return any results.
    (NOTE(review): the length check is not visible in this function —
    verify it is enforced by the caller.)

    :param field: index name
    :param q: query string for index term

    :return: JSON response with the list of values matching query.
    """
    IdxPHRASE = IdxINDEX.idxPHRASEF(field, fallback=False)
    matches = IdxPHRASE.query.filter(
        IdxPHRASE.term.contains(q)).limit(20).values('term')
    # build a real list: a bare `map` object is a lazy iterator on
    # Python 3 and is not JSON-serializable by jsonify
    payload = [{'value': row[0]} for row in matches]

    return jsonify(results=payload)
Esempio n. 5
0
 def _index_id(self):
     """Return the id of the index configured for the 'collection' field."""
     return IdxINDEX.get_from_field('collection').id
Esempio n. 6
0
 def _index_id(self):
     """Look up the 'collection' field index and return its id."""
     return IdxINDEX.get_from_field('collection').id
Esempio n. 7
0
def search_unit_in_idxphrases(p, f, m, wl=0):
    """Search for phrase 'p' inside idxPHRASE*F table for field 'f'.

    Return hitset of recIDs found. The search type is defined by 'm'
    (e.g. equals to 'r' for a regexp search).

    :param p: search pattern
    :param f: field name
    :param m: matching type ('r' for regexp, otherwise phrase/span search)
    :param wl: wildcard limit; 0 disables the limit
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when the wildcard limit
        is reached
    """
    # call word search method in some cases:
    if f.endswith('count'):
        return search_unit_in_bibwords(p, f, wl=wl)
    # will hold output result set
    hitset = intbitset()
    # flag for knowing if to limit the query results or not
    use_query_limit = False
    # deduce in which idxPHRASE table we will search:
    model = IdxINDEX.idxPHRASEF(f, fallback=not f)
    if model is None:
        return intbitset()  # phrase index f does not exist

    # detect query type (exact phrase, partial phrase, regexp):
    if m == 'r':
        use_query_limit = True

        def column_filter(column):
            return column.op('REGEXP')(p)
    else:
        p = p.replace('*', '%')  # we now use '*' as the truncation character
        ps = p.split("->", 1)  # check for span query:
        if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
            use_query_limit = True

            def column_filter(column):
                return column.between(ps[0], ps[1])
        elif p.find('%') > -1:
            use_query_limit = True

            def column_filter(column):
                return column.like(p)
        else:
            def column_filter(column):
                return column == p

    # special washing for fuzzy author index:
    # if f in ('author', 'firstauthor', 'exactauthor', 'exactfirstauthor',
    #          'authorityauthor'):
    #    query_params_washed = ()
    #    for query_param in query_params:
    #        query_params_washed += (wash_author_name(query_param),)
    #    query_params = query_params_washed

    query = model.query.filter(column_filter(model.term))
    # limit only potentially expensive (regexp/span/wildcard) queries:
    if use_query_limit and wl > 0:
        query = query.limit(wl)

    results = query.values('hitlist')
    limit_reached = use_query_limit and wl > 0 and len(results) == wl
    # fill the result set:
    for row in results:
        hitset |= intbitset(row[0])
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
Esempio n. 8
0
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern
    :param f: field name
    :param m: matching type (forwarded to the phrase-search fallback)
    :param wl: wildcard limit; 0 disables the limit
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when the wildcard limit
        is reached
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # start with no result set: the first batch of hits seeds it and later
    # batches are intersected into it.  (Initializing it to an empty
    # intbitset would make every intersection — and thus every search —
    # come out empty, since the `is None` check below would never fire.)
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append((column.between(pairs_left[-1],
                                              pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefully this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [
                        term for term in termlist
                        if term.lower().find(p.lower()) > -1
                ]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
Esempio n. 9
0
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word (or span/wildcard pattern) to search for
    :param f: field name; falls back to 'anyfield' when empty
    :param decompress: kept for API compatibility (unused in this code path)
    :param wl: wildcard limit; 0 disables the limit
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: when the wildcard limit
        is reached
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached

    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language

    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as the truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)

        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                # Query methods are generative: the returned query must be
                # kept, otherwise the limit is silently dropped.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(
                wash_index_term(word))).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
Esempio n. 10
0
def search_unit_in_idxphrases(p, f, m, wl=0):
    """Search for phrase 'p' inside idxPHRASE*F table for field 'f'.

    Return hitset of recIDs found. The search type is defined by 'm'
    (e.g. equals to 'r' for a regexp search).

    :param p: search pattern
    :param f: field name
    :param m: matching type ('r' for regexp, otherwise phrase/span search)
    :param wl: wildcard limit; 0 disables the limit
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when the wildcard limit
        is reached
    """
    # delegate field-count queries to the word search:
    if f.endswith('count'):
        return search_unit_in_bibwords(p, f, wl=wl)
    # will hold output result set
    hitset = intbitset()
    # flag for knowing if to limit the query results or not
    use_query_limit = False
    # deduce in which idxPHRASE table we will search:
    model = IdxINDEX.idxPHRASEF(f, fallback=not f)
    if model is None:
        return intbitset()  # phrase index f does not exist

    # detect query type (exact phrase, partial phrase, regexp):
    if m == 'r':
        use_query_limit = True

        def match_term(column):
            return column.op('REGEXP')(p)
    else:
        p = p.replace('*', '%')  # we now use '*' as the truncation character
        ps = p.split("->", 1)  # check for span query:
        if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
            use_query_limit = True

            def match_term(column):
                return column.between(ps[0], ps[1])
        elif p.find('%') > -1:
            use_query_limit = True

            def match_term(column):
                return column.like(p)
        else:
            def match_term(column):
                return column == p

    # special washing for fuzzy author index:
    # if f in ('author', 'firstauthor', 'exactauthor', 'exactfirstauthor',
    #          'authorityauthor'):
    #    query_params_washed = ()
    #    for query_param in query_params:
    #        query_params_washed += (wash_author_name(query_param),)
    #    query_params = query_params_washed

    query = model.query.filter(match_term(model.term))
    # only potentially expensive (regexp/span/wildcard) queries are limited:
    if use_query_limit and wl > 0:
        query = query.limit(wl)

    results = query.values('hitlist')
    limit_reached = use_query_limit and wl > 0 and len(results) == wl
    # fill the result set:
    for row in results:
        hitset |= intbitset(row[0])
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
Esempio n. 11
0
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' inside idxPAIR table for field 'f' and return hitset.

    :param p: search pattern
    :param f: field name
    :param m: matching type (forwarded to the phrase-search fallback)
    :param wl: wildcard limit; 0 disables the limit
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when the wildcard limit
        is reached
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer
    )
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # no result set yet: the first condition's hits seed it, the rest are
    # intersected in.  Starting from an empty intbitset instead would make
    # the `is None` branch below unreachable and every search return empty.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True)
            )
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefully this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
Esempio n. 12
0
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word (or span/wildcard pattern) to search for
    :param f: field name; falls back to 'anyfield' when empty
    :param decompress: kept for API compatibility (unused in this code path)
    :param wl: wildcard limit; 0 disables the limit
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: when the wildcard limit
        is reached
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached

    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language

    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as the truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)

        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed)
        )
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                # generative API: keep the returned query, otherwise the
                # limit is silently discarded
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(
                model.term.like(wash_index_term(word))
            ).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset