def default_search_unit(p, f, m, wl):
    """Query correct index type and return hitset.

    :param p: search pattern
    :param f: field/index name
    :param m: matching type ('a' phrase, 'r' regexp, else word search)
    :param wl: wildcard limit passed through to the underlying search
    :return: hitset of matching record IDs
    """
    if m not in ('a', 'r') and not is_marc_tag(f):
        # neither phrase nor regexp search: default to bibwords search
        return search_unit_in_bibwords(p, f, wl=wl)
    # phrase search or regexp search
    index_id = IdxINDEX.get_index_id_from_field(f)
    if index_id == 0:
        # no configured index for this field: fall back to bibxxx tables
        return search_unit_in_bibxxx(p, f, m or 'a', wl)
    if m == 'a' and index_id in IdxINDEX.get_idxpair_field_ids():
        # exact match on the admin configured fields:
        # search in the pair tables
        return search_unit_in_idxpairs(p, f, m or 'a', wl)
    return search_unit_in_idxphrases(p, f, m or 'a', wl)
def autocomplete(field, q):
    """Autocomplete data from indexes.

    It uses POSTed arguments with name `q` as the query string for the
    index term.  (NOTE(review): the old docstring claimed `q` must be
    longer than 3 characters, but no such check exists here — presumably
    it is enforced by the caller; verify.)

    :param field: index name
    :param q: query string for index term
    :return: JSON response with the list of values matching query.
    """
    IdxPHRASE = IdxINDEX.idxPHRASEF(field, fallback=False)
    if IdxPHRASE is None:
        # unknown index: answer with an empty result set instead of
        # raising AttributeError below
        return jsonify(results=[])
    rows = IdxPHRASE.query.filter(
        IdxPHRASE.term.contains(q)).limit(20).values('term')
    # Materialize as a list: a lazy map object is not JSON-serializable
    # under Python 3.
    results = [{'value': row[0]} for row in rows]
    return jsonify(results=results)
def _index_id(self):
    """Return the id of the index configured for the 'collection' field."""
    index = IdxINDEX.get_from_field('collection')
    return index.id
def search_unit_in_idxphrases(p, f, m, wl=0):
    """Search for phrase 'p' inside idxPHRASE*F table for field 'f'.

    Return hitset of recIDs found.  The search type is defined by 'm'
    (e.g. equals 'r' for a regexp search).

    :param p: phrase pattern; '*' is the truncation character and
        'a->b' denotes a span query
    :param f: field/index name
    :param m: matching type ('r' triggers regexp matching)
    :param wl: wildcard limit; 0 disables limiting
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: if the wildcard query
        limit 'wl' was reached; the partial hitset is attached.
    """
    # call word search method in some cases:
    if f.endswith('count'):
        return search_unit_in_bibwords(p, f, wl=wl)
    # will hold output result set
    hitset = intbitset()
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag for knowing if to limit the query results or not
    use_query_limit = False
    # deduce in which idxPHRASE table we will search:
    model = IdxINDEX.idxPHRASEF(f, fallback=not f)
    if model is None:
        return intbitset()  # phrase index f does not exist
    # detect query type (exact phrase, partial phrase, regexp):
    if m == 'r':
        use_query_limit = True
        column_filter = lambda column: column.op('REGEXP')(p)
    else:
        p = p.replace('*', '%')  # we now use '*' as truncation character
        ps = p.split("->", 1)  # check for span query:
        if len(ps) == 2 and not (ps[0].endswith(' ') or
                                 ps[1].startswith(' ')):
            use_query_limit = True
            column_filter = lambda column: column.between(ps[0], ps[1])
        elif p.find('%') > -1:
            use_query_limit = True
            column_filter = lambda column: column.like(p)
        else:
            column_filter = lambda column: column == p
    query = model.query.filter(column_filter(model.term))
    # perform search; the limit only applies to wildcard-style queries:
    if use_query_limit and wl > 0:
        query = query.limit(wl)
    results = query.values('hitlist')
    limit_reached = use_query_limit and wl > 0 and len(results) == wl
    # fill the result set:
    for row in results:
        hitset |= intbitset(row[0])
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern; '*' is the truncation character and
        'a->b' denotes a span query
    :param f: field/index name
    :param m: matching type (forwarded when falling back to phrase search)
    :param wl: wildcard limit; 0 disables limiting
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: if the wildcard query
        limit 'wl' was reached; the partial result set is attached.
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: must start as None so the first batch of hits seeds the set;
    # starting from an empty intbitset makes every intersection_update()
    # below yield an empty result (the loop explicitly tests for None).
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz,
            # so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
            do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefully this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word pattern; '*' is the truncation character and
        'a->b' denotes a span query
    :param f: field/index name; empty falls back to 'anyfield'
    :param decompress: kept in the signature for backward compatibility
    :param wl: wildcard limit; 0 disables limiting
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: if the wildcard query
        limit 'wl' was reached; the partial hitset is attached.
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached
    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language
    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)
        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(
                model.term.like(wash_index_term(word)))
            if wl > 0:
                # BUGFIX: Query.limit() returns a NEW query object; the
                # original discarded it, so the wildcard limit was never
                # actually applied here.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(
                wash_index_term(word))).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' inside idxPAIR table for field 'f'.

    :param p: search pattern; '*' is the truncation character and
        'a->b' denotes a span query
    :param f: field/index name
    :param m: matching type (forwarded when falling back to phrase search)
    :param wl: wildcard limit; 0 disables limiting
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: if the wildcard query
        limit 'wl' was reached; the partial result set is attached.
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer
    )
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: must start as None so the first batch of hits seeds the set;
    # starting from an empty intbitset makes every intersection_update()
    # below yield an empty result (the loop explicitly tests for None).
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz,
            # so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True)
            )
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
            do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefully this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word pattern; '*' is the truncation character and
        'a->b' denotes a span query
    :param f: field/index name; empty falls back to 'anyfield'
    :param decompress: kept in the signature for backward compatibility
    :param wl: wildcard limit; 0 disables limiting
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: if the wildcard query
        limit 'wl' was reached; the partial hitset is attached.
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached
    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language
    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)
        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed)
        )
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(
                model.term.like(wash_index_term(word)))
            if wl > 0:
                # BUGFIX: Query.limit() returns a NEW query object; the
                # original discarded it, so the wildcard limit was never
                # actually applied here.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(
                model.term.like(wash_index_term(word))
            ).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset