def fill():
    """Build and return the in-memory citation caches.

    Loads the citation weights and the self-citation counts, preferring
    the serialized cache over the database, and precomputes sorted count
    lists so ranked queries do not have to re-sort on every call.
    """
    from invenio.legacy.bibrank.tag_based_indexer import fromDB

    caches = {}

    # Citation weights: use the serialized cache when present, else the DB.
    raw = cache.get("citations_weights")
    if raw:
        weights = deserialize_via_marshal(raw)
    else:
        weights = fromDB("citation")
    caches["citations_weights"] = weights

    # for cited:M->N queries, it is interesting to cache also
    # some preprocessed citationdict:
    caches["citations_keys"] = intbitset(weights.keys())

    # Citation counts, best-cited first.
    counts = list(iteritems(weights))
    counts.sort(key=itemgetter(1), reverse=True)
    caches["citations_counts"] = counts

    # Self-cites: same cache-then-DB lookup as above.
    raw = cache.get("selfcites_weights")
    if raw:
        selfcites = deserialize_via_marshal(raw)
    else:
        selfcites = fromDB("selfcites")

    # Self-citation-corrected weight = total cites minus self-cites.
    corrected = {}
    for recid, total in counts:
        corrected[recid] = total - selfcites.get(recid, 0)
    caches["selfcites_weights"] = corrected

    corrected_counts = [(recid, corrected.get(recid, total))
                        for recid, total in counts]
    corrected_counts.sort(key=itemgetter(1), reverse=True)
    caches["selfcites_counts"] = corrected_counts

    return caches
def fill():
    """Build and return the in-memory citation caches.

    Prefers the serialized cache entries over a database reload and
    precomputes sorted (recid, count) lists for ranked queries.
    """
    alldicts = {}
    from invenio.legacy.bibrank.tag_based_indexer import fromDB
    serialized_weights = cache.get('citations_weights')
    if serialized_weights:
        weights = deserialize_via_marshal(serialized_weights)
    else:
        weights = fromDB('citation')
    alldicts['citations_weights'] = weights
    # for cited:M->N queries, it is interesting to cache also
    # some preprocessed citationdict:
    alldicts['citations_keys'] = intbitset(weights.keys())
    # Citation counts, sorted in place so the best-cited records come first.
    alldicts['citations_counts'] = [t for t in iteritems(weights)]
    alldicts['citations_counts'].sort(key=itemgetter(1), reverse=True)
    # Self-cites
    serialized_weights = cache.get('selfcites_weights')
    if serialized_weights:
        selfcites = deserialize_via_marshal(serialized_weights)
    else:
        selfcites = fromDB('selfcites')
    # Self-citation-corrected weight = total cites minus self-cites.
    selfcites_weights = {}
    for recid, counts in alldicts['citations_counts']:
        selfcites_weights[recid] = counts - selfcites.get(recid, 0)
    alldicts['selfcites_weights'] = selfcites_weights
    alldicts['selfcites_counts'] = [(recid, selfcites_weights.get(recid, cites))
                                    for recid, cites in alldicts['citations_counts']]
    alldicts['selfcites_counts'].sort(key=itemgetter(1), reverse=True)
    return alldicts
def rank_method_stat(rank_method_code, reclist, lwords):
    """Show some statistics about the search result.

    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked (recid, score) records, best last
    lwords - the words in the query as (term, table) pairs

    NOTE(review): voutput is accumulated locally and never returned or
    stored, so the generated HTML is discarded here; other functions in
    this codebase use a module-level ``voutput`` -- confirm intent.
    """
    voutput = ""
    # Inspect at most the 20 best records (best are at the END of reclist).
    if len(reclist) > 20:
        j = 20
    else:
        j = len(reclist)
    voutput += "<br />Rank statistics:<br />"
    for i in range(1, j + 1):
        voutput += "%s,Recid:%s,Score:%s<br />" % (i, reclist[len(reclist) - i][0], reclist[len(reclist) - i][1])
        for (term, table) in lwords:
            # table is an internal rnkWORD table name; term is parameterized.
            term_recs = run_sql("""SELECT hitlist FROM %s WHERE term=%%s""" % table, (term,))
            if term_recs:
                term_recs = deserialize_via_marshal(term_recs[0][0])
                if reclist[len(reclist) - i][0] in term_recs:
                    voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
        voutput += "<br />"
    voutput += "<br />Score variation:<br />"
    # Histogram of scores over the whole result list.
    count = {}
    for i in range(0, len(reclist)):
        count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
    # assumes scores are integers in 0..100 -- TODO confirm
    i = 100
    while i >= 0:
        if i in count:
            voutput += "%s-%s<br />" % (i, count[i])
        i -= 1
def rank_method_stat(rank_method_code, reclist, lwords):
    """Show some statistics about the search result.

    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked records
    lwords - the words in the query
    """
    voutput = ""
    # Best-ranked records live at the end of reclist; show up to 20 of them.
    top = min(len(reclist), 20)
    voutput += "<br />Rank statistics:<br />"
    for rank in range(1, top + 1):
        recid = reclist[-rank][0]
        score = reclist[-rank][1]
        voutput += "%s,Recid:%s,Score:%s<br />" % (rank, recid, score)
        for (term, table) in lwords:
            rows = run_sql("""SELECT hitlist FROM %s WHERE term=%%s""" % table, (term,))
            if rows:
                hits = deserialize_via_marshal(rows[0][0])
                if recid in hits:
                    voutput += "%s-%s / " % (term, hits[recid])
        voutput += "<br />"
    # Score histogram, reported from 100 down to 0.
    voutput += "<br />Score variation:<br />"
    count = {}
    for entry in reclist:
        count[entry[1]] = count.get(entry[1], 0) + 1
    for level in range(100, -1, -1):
        if level in count:
            voutput += "%s-%s<br />" % (level, count[level])
def fromDB(rank_method_code):
    """Get the data for a rank method.

    rank_method_code - name of the method in the "rnkMETHOD" table

    Returns the deserialized relevance-data dict, or {} when the method
    or its data cannot be found.
    """
    id = run_sql("""SELECT id from "rnkMETHOD" where name=%s""",
                 (rank_method_code, ))
    if not id:
        # Unknown method name: the old code raised IndexError on id[0][0];
        # treat it as "no data" instead.
        return {}
    res = run_sql("""SELECT relevance_data FROM "rnkMETHODDATA" WHERE "id_rnkMETHOD"=%s""",
                  (id[0][0], ))
    if res:
        return deserialize_via_marshal(res[0][0])
    else:
        return {}
def fromDB(rank_method_code):
    """Get the data for a rank method.

    rank_method_code - name of the method in the rnkMETHOD table

    Returns the deserialized relevance-data dict, or {} when the method
    or its data cannot be found.
    """
    id = run_sql("SELECT id from rnkMETHOD where name=%s",
                 (rank_method_code, ))
    if not id:
        # Unknown method name: the old code raised IndexError on id[0][0];
        # treat it as "no data" instead.
        return {}
    res = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
        (id[0][0], ))
    if res:
        return deserialize_via_marshal(res[0][0])
    else:
        return {}
def get_uid_based_on_pref(prefname, prefvalue):
    """Get a user's UID whose preference prefname has value prefvalue.

    Scans every user's serialized settings blob; returns the UID of the
    last matching user, or None when nobody matches.
    """
    prefs = run_sql("SELECT id, settings FROM user WHERE settings is not NULL")
    the_uid = None
    for pref in prefs:
        try:
            settings = deserialize_via_marshal(pref[1])
            if (prefname in settings) and (settings[prefname] == prefvalue):
                the_uid = pref[0]
        except Exception:
            # Corrupt/unreadable settings blob: skip this user rather than
            # aborting the scan.  (Was a bare ``except:``, which also
            # swallowed KeyboardInterrupt and SystemExit.)
            pass
    return the_uid
def get_uid_based_on_pref(prefname, prefvalue):
    """get the user's UID based where his/her preference prefname has value prefvalue in preferences"""
    # Scans every user's serialized settings blob; returns the UID of the
    # *last* matching user, or None when nobody matches.
    prefs = run_sql("""SELECT id, settings FROM "user" WHERE settings is not NULL""")
    the_uid = None
    for pref in prefs:
        try:
            settings = deserialize_via_marshal(pref[1])
            if (prefname in settings) and (settings[prefname] == prefvalue):
                the_uid = pref[0]
        except:
            # NOTE(review): bare except silently skips users whose settings
            # cannot be deserialized -- it also masks KeyboardInterrupt;
            # consider narrowing to Exception.
            pass
    return the_uid
def do_upgrade():
    """do upgrade."""
    # Rename the legacy harvesting argument "c_cfg-file" to "c_stylesheet"
    # in every oaiHARVEST row that still carries the old key.
    rows = run_sql("SELECT id, arguments FROM oaiHARVEST", with_dict=True)
    # Move away from old columns
    for row in rows:
        if not row['arguments']:
            continue
        args = deserialize_via_marshal(row['arguments'])
        if "c_cfg-file" in args:
            args['c_stylesheet'] = args.pop('c_cfg-file')
            # Only rows that actually changed are written back.
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                    (serialize_via_marshal(args), row['id']))
def do_upgrade():
    """do upgrade."""
    # Rename the legacy harvesting argument "c_cfg-file" to "c_stylesheet"
    # in every oaiHARVEST row that still carries the old key.
    rows_to_change = run_sql("SELECT id, arguments FROM oaiHARVEST",
                             with_dict=True)
    # Move away from old columns
    for row in rows_to_change:
        if row['arguments']:
            arguments = deserialize_via_marshal(row['arguments'])
            if "c_cfg-file" in arguments:
                arguments['c_stylesheet'] = arguments['c_cfg-file']
                del arguments['c_cfg-file']
                # Only rows that actually changed are written back.
                run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                        (serialize_via_marshal(arguments), row['id']))
def get_data_for_definition_rnk(method_name, rnk_name):
    """Return the data dictionary for the method_name ranking method.

    method_name - sorting-method name (used only in log messages)
    rnk_name - ranking-method name in the "rnkMETHOD" table

    Returns {} when no data exists or the query fails.
    """
    try:
        res = run_sql('SELECT d.relevance_data \
                       from "rnkMETHODDATA" d, "rnkMETHOD" r WHERE \
                       d."id_rnkMETHOD" = r.id AND \
                       r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s'
                          % method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
    except Error as err:
        # Fixed typos in the operator-facing message ("errror occured").
        write_message("No data could be found for sorting method %s. "
                      "The following error occurred: [%s]"
                      % (method_name, err), stream=sys.stderr)
    return {}
def del_recids(rank_method_code, range_rec):
    """Delete some records from the rank method.

    rank_method_code - method name in "rnkMETHOD"
    range_rec - iterable of (recid_start, recid_end) pairs; recid_end is
        exclusive (range() semantics)
    """
    id = run_sql("""SELECT id from "rnkMETHOD" where name=%s""",
                 (rank_method_code, ))
    if not id:
        # Unknown method: the old code crashed with IndexError on id[0][0].
        write_message("Create before deleting!")
        return
    res = run_sql("""SELECT relevance_data FROM "rnkMETHODDATA" WHERE "id_rnkMETHOD"=%s""",
                  (id[0][0], ))
    if res:
        rec_dict = deserialize_via_marshal(res[0][0])
        write_message("Old size: %s" % len(rec_dict))
        for (recids, recide) in range_rec:
            for i in range(int(recids), int(recide)):
                if i in rec_dict:
                    del rec_dict[i]
        write_message("New size: %s" % len(rec_dict))
        begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        intoDB(rec_dict, begin_date, rank_method_code)
    else:
        write_message("Create before deleting!")
def get_data_for_definition_rnk(method_name, rnk_name):
    '''Returns the dictionary with data for method_name ranking method'''
    # Joins rnkMETHOD/rnkMETHODDATA by rnk_name; falls through to {} when
    # nothing is found or the query raises.
    try:
        res = run_sql('SELECT d.relevance_data \
                       from rnkMETHODDATA d, rnkMETHOD r WHERE \
                       d.id_rnkMETHOD = r.id AND \
                       r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s' \
                          %method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
    except Error as err:
        # NOTE(review): the message below contains typos ("errror occured");
        # left untouched here since it is a runtime string.
        write_message("No data could be found for sorting method %s. " \
                      "The following errror occured: [%s]" \
                      %(method_name, err), stream=sys.stderr)
    return {}
def del_recids(rank_method_code, range_rec):
    """Delete some records from the rank method"""
    # NOTE(review): id[0][0] raises IndexError when the method name is
    # unknown -- confirm callers guarantee the method exists.
    id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
    res = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s",
        (id[0][0], ))
    if res:
        rec_dict = deserialize_via_marshal(res[0][0])
        write_message("Old size: %s" % len(rec_dict))
        # range() is end-exclusive, so recide itself is kept -- presumably
        # intentional; verify against callers.
        for (recids, recide) in range_rec:
            for i in range(int(recids), int(recide)):
                if i in rec_dict:
                    del rec_dict[i]
        write_message("New size: %s" % len(rec_dict))
        begin_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        intoDB(rec_dict, begin_date, rank_method_code)
    else:
        write_message("Create before deleting!")
def do_upgrade():
    """do upgrade."""
    # Migration: widen `arguments` from text to blob, then fold the
    # standalone bibconvertcfgfile/bibfilterprogram columns into the
    # serialized `arguments` dict and drop them.  Statement order matters.
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    if '`arguments` text' in create_statement:
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")
    # translate old values
    if '`bibconvertcfgfile`' in create_statement:
        rows_to_change = run_sql(
            """SELECT id, bibconvertcfgfile, bibfilterprogram, arguments
               FROM oaiHARVEST""", with_dict=True)
        # Move away from old columns
        for row in rows_to_change:
            if row['arguments']:
                arguments = deserialize_via_marshal(row['arguments'])
            else:
                arguments = {}
            arguments['c_cfg-file'] = row['bibconvertcfgfile']
            arguments['f_filter-file'] = row['bibfilterprogram']
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                    (serialize_via_marshal(arguments), row['id']))
        # Old columns are only dropped after every row has been migrated.
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank records containing specified words and return a sorted list.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code
    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed
    # and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword(term):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], ' ', term)))
            for term in terms:
                if "stemmer" in methods[rank_method_code]:  # stem word
                    term = stem(string.replace(term, ' ', ''), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        # rnkWORD_table comes from internal config; term is parameterized.
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term, ))
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
    # Add any documents not ranked to the end of the list
    # NOTE(review): zip()+list concatenation is Python 2 semantics; under
    # Python 3 zip() returns an iterator and this would raise TypeError.
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb
    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance, verbose):
    """Rank records based on predetermined values.

    input:
    rank_method_code - the code of the method, from the name field in
        rnkMETHOD, used to get predetermined values from rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end:
        [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value
    """
    voutput = ""
    rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s", (rank_method_code, ))
    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)
    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])
    lwords_hitset = None
    # find which docs to search based on ranges..should be done in search_engine...
    for j in range(0, len(lwords)):
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if lword.find("->") > -1:
                lword = lword.split("->")
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            # BUGFIX: lword is a string here; the old comparison
            # ``lword < max_recid + 1`` compared str to int, which is always
            # False under CPython 2 ordering rules, so every single-recid
            # query was rejected as "out of range".  Convert first.
            elif int(lword) < max_recid + 1:
                lwords_hitset.add(int(lword))
            else:
                return (None, "Warning: Given record IDs are out of range.", "", voutput)
    rnkdict = deserialize_via_marshal(rnkdict[0][0])
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)
    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []
    if not lwords_hitset:  # rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else:  # rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))
    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)
    # Ascending by score; unranked records are prepended (best come last).
    reclist.sort(lambda x, y: cmp(x[1], y[1]))
    return (reclist_addend + reclist, METHODS[rank_method_code]["prefix"], METHODS[rank_method_code]["postfix"], voutput)
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    p - search pattern ('%'/'*' wildcards or an 'a->b' span supported)
    f - logical field whose pair index is searched
    m - matching mode (forwarded to the phrase-search fallback)
    wl - wildcard limit; 0 disables the limit

    Raises InvenioWebSearchWildcardLimitError when wl is reached.
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: seed with None so the first hitlist initializes the result.
    # The old code started from an empty intbitset, which made the
    # ``is None`` branch below dead code and intersection_update() always
    # produce an empty result set.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []
    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace("*", "%")
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(" ") or ps[1].startswith(" ")):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's swith to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and len(pairs_right) > 1 and pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append((column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find("%") > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = "xxxxxxxxxx"  # hopefuly this will not clash with anything in the future
        p = p.replace("%", replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, "%")
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
                do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))
    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)
    # check if we need to eliminate the false positives
    if cfg["CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH"] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # result_set is still None only when every condition path returned early,
    # which cannot reach here; ``or`` keeps the empty-set contract anyway.
    return result_set or intbitset()
def update_bibsort_tables(recids, method, update_timestamp = True):
    """Update the data structures for sorting method ``method`` for ``recids``.

    recids - intbitset of record ids to (re)compute
    method - name of the sorting method in the "bsrMETHOD" table
    update_timestamp - forwarded to the method-data writer

    Returns True on success, False on any failure.
    """
    # Resolve the method row (id, definition, washer) or bail out.
    res = run_sql("""SELECT id, definition, washer from "bsrMETHOD" where name = %s""", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' %method, sys.stderr)
        return False
    res = run_sql("""SELECT data_dict, data_dict_ordered, data_list_sorted FROM "bsrMETHODDATA" where "id_bsrMETHOD" = %s""", (method_id, ))
    if res and res[0]:
        # Only data_dict is deserialized up front; the ordered structures are
        # loaded lazily below, and only when there is actually work to do.
        data_dict = deserialize_via_marshal(res[0][0])
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      %method)
        return False #since this case should have been treated earlier
    #get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      %(method, str(recids)))
    else:
        apply_washer(field_data, washer)
    #if a recid is not in field_data that is because no value was found for it
    #so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                #we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else: # recid is new, and needs to be inserted
            recids_to_insert.append(recid)
    #remove the recids that were not previously in bibsort
    recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict]
    #dicts to keep the ordered values for the recids - useful bor bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}
    if recids_to_insert or recids_to_modify or recids_to_delete:
        # Now we know changes are needed: load the heavy ordered structures.
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          %len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          %len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          %len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid)
        # Snapshot the post-update ordered values for bucket maintenance.
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        #write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict, \
                                             data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False
        #update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert, recids_old_ordered, method_id, update_timestamp)
        except Error as err:
            write_message("[%s] The bucket data for method %s has not been updated" \
                          %(method, err), sys.stderr)
            return False
    return True
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    p - search pattern ('%'/'*' wildcards or an 'a->b' span supported)
    f - logical field whose pair index is searched
    m - matching mode (forwarded to the phrase-search fallback)
    wl - wildcard limit; 0 disables the limit

    Raises InvenioWebSearchWildcardLimitError when wl is reached.
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # NOTE(review): result_set starts as an EMPTY intbitset, so the
    # ``result_set is None`` branch below is dead and intersection_update()
    # always yields an empty set -- looks like this should be ``None``;
    # confirm against upstream.
    result_set = intbitset()
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []
    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's swith to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append((column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'  # hopefuly this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
                do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))
    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)
    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [
                    term for term in termlist
                    if term.lower().find(p.lower()) > -1
                ]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
def ordered(self):
    """Return the deserialized ordered dict stored in ``data_dict_ordered``."""
    return deserialize_via_marshal(self.data_dict_ordered)
def find_similar(rank_method_code, recID, hitset, rank_limit_relevance, verbose, methods):
    """Find terms to use for calculating similarity.

    Terms are taken from the recid given; returns a list of recids and
    relevance values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    recID - record to use for find similar
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    startCreate = time.time()
    global voutput
    voutput = ""
    if verbose > 0:
        voutput += ("<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code)
    # The caller-supplied threshold is overridden by the method default.
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]
    try:
        recID = int(recID)
    except Exception as e:
        return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput)
    rec_terms = run_sql("""SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1], (recID,))
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])
    # Get all documents using terms from the selected documents
    if len(rec_terms) == 0:
        return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput)
    else:
        # terms[1:-1] strips the brackets of the stringified key list to
        # build the SQL IN (...) clause -- relies on py2 repr() of a list.
        terms = "%s" % rec_terms.keys()
        terms_recs = dict(run_sql("""SELECT term, hitlist FROM %s WHERE term IN (%s)""" % (methods[rank_method_code]["rnkWORD_table"], terms[1:len(terms) - 1])))
    tf_values = {}
    # Calculate all term frequencies
    for (term, tf) in iteritems(rec_terms):
        # assumes tf is (frequency, weight-factor) -- TODO confirm
        if len(term) >= methods[rank_method_code]["min_word_length"] and term in terms_recs and tf[1] != 0:
            tf_values[term] = int((1 + math.log(tf[0])) * tf[1])  # calculate term weigth
    tf_values = tf_values.items()
    tf_values.sort(lambda x, y: cmp(y[1], x[1]))  # sort based on weigth
    lwords = []
    stime = time.time()
    (recdict, rec_termcount) = ({}, {})
    for (t, tf) in tf_values:  # t=term, tf=term frequency
        term_recs = deserialize_via_marshal(terms_recs[t])
        # Accept the term when the term list is small, or when the term is
        # frequent enough overall but not too common across the collection.
        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"] and (((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) <= methods[rank_method_code]["max_word_occurence"]) and ((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) >= methods[rank_method_code]["min_word_occurence"]))):  # too complicated...something must be done
            lwords.append((t, methods[rank_method_code]["rnkWORD_table"]))  # list of terms used
            (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)), term_recs, hitset, recdict, rec_termcount, verbose, "true")  # true tells the function to not calculate all unimportant terms
        if len(tf_values) > methods[rank_method_code]["max_nr_words_lower"] and (len(lwords) == methods[rank_method_code]["max_nr_words_upper"] or tf < 0):
            break
    if len(recdict) == 0 or len(lwords) == 0:
        return (None, "Could not find similar documents for this query.", "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
    if verbose > 0:
        voutput += ("<br />Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0])
        voutput += "Number of terms to use for query: %s<br />" % len(lwords)
        voutput += "Terms: %s<br />" % lwords
        voutput += "Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Prepare time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank the records in hitset according to how often they use the query words.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    methods - per-method configuration dict, keyed by rank_method_code
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code
    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed and
    # stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        # NOTE(review): Python 2 `string` module call; lowercases the raw query term.
        term = string.lower(lwords_old[i])
        # Keep the term unless stopword removal is configured AND the term is a
        # stopword. NOTE(review): relies on `and` binding tighter than `or`:
        # (not stopwords == "True") or (stopwords and not is_stopword(term)) — confirm intended.
        if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword(term):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            # Split on the configured separator characters so each alphanumeric
            # fragment is also considered on its own.
            terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], " ", term)))
            for term in terms:
                if "stemmer" in methods[rank_method_code]:  # stem word
                    term = stem(string.replace(term, " ", ""), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term;
    # calculate the relevance for each term before sorting the list of records.
    for (term, table) in lwords:
        # Table name cannot be bound as an SQL parameter; the term itself is bound safely.
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,))
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            # "Gi" presumably holds the term's global importance entry — TODO confirm.
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs  # free the deserialized hitlist before the next term

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Add any documents not ranked to the end of the list (with rank value 0).
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb; NOTE(review): list concat — Python 2 zip semantics
    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
    rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance, verbose):
    """Rank records using predetermined per-record values from rnkMETHODDATA.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD,
                       used to get predetermined values from rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    rnkdict = run_sql("SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s", (rank_method_code,))
    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)
    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])
    lwords_hitset = None
    for j in range(0, len(lwords)):  # find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if lword.find("->") > -1:  # "A->B" range of record IDs
                lword = lword.split("->")
                # NOTE(review): bounds checks look inconsistent (>= max_recid vs
                # >= max_recid + 1) and the range below excludes int(lword[1]) —
                # confirm both are intended before touching.
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            elif lword < max_recid + 1:
                # NOTE(review): str-to-int comparison — meaningful only on
                # Python 2; raises TypeError on Python 3. Confirm target runtime.
                lwords_hitset.add(int(lword))
            else:
                return (None, "Warning: Given record IDs are out of range.", "", voutput)
    rnkdict = deserialize_via_marshal(rnkdict[0][0])

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset:  # rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]  # shrink the dict as we consume it
            else:
                reclist_addend.append((recID, 0))  # no precomputed value: rank 0
    else:  # rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)

    # Ascending by rank value (Python 2 cmp-style sort); unranked records go first.
    reclist.sort(lambda x, y: cmp(x[1], y[1]))
    # NOTE(review): uses the module-level METHODS here, while sibling functions
    # take a `methods` parameter — confirm the two stay in sync.
    return (reclist_addend + reclist, METHODS[rank_method_code]["prefix"], METHODS[rank_method_code]["postfix"], voutput)
def find_similar(rank_method_code, recID, hitset, rank_limit_relevance, verbose, methods):
    """Find records similar to recID, ranking hitset by the terms of recID.

    Terms are taken from the recid given; returns a list of recids and relevance.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    recID - record to use for find similar
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    methods - per-method configuration dict, keyed by rank_method_code
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    startCreate = time.time()
    global voutput  # module-level buffer; also returned, so other code may read it
    voutput = ""
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code
    # NOTE(review): the caller-supplied threshold is discarded — the method's
    # configured default always wins. Confirm this is intended.
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]

    try:
        recID = int(recID)
    except Exception as e:
        return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput)

    # rnkWORD_table minus its last character, plus "R", presumably names the
    # per-record term table — TODO confirm the table-naming scheme.
    rec_terms = run_sql("""SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1], (recID, ))
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    # Get all documents using terms from the selected documents.
    if len(rec_terms) == 0:
        return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput)
    else:
        # Build the IN (...) list from the repr of the keys, stripping the
        # surrounding brackets. NOTE(review): terms are interpolated into the
        # SQL string rather than bound — they come from the index, not from
        # user input, but verify upstream sanitization.
        terms = "%s" % rec_terms.keys()
        terms_recs = dict(run_sql("""SELECT term, hitlist FROM %s WHERE term IN (%s)""" % (methods[rank_method_code]["rnkWORD_table"], terms[1:len(terms) - 1])))

    tf_values = {}
    # Calculate all term frequencies.
    for (term, tf) in iteritems(rec_terms):
        # tf appears to be a (frequency, weight-factor) pair — TODO confirm layout.
        if len(term) >= methods[rank_method_code]["min_word_length"] and term in terms_recs and tf[1] != 0:
            tf_values[term] = int((1 + math.log(tf[0])) * tf[1])  # calculate term weigth
    tf_values = tf_values.items()
    tf_values.sort(lambda x, y: cmp(y[1], x[1]))  # sort based on weigth, descending (Python 2 cmp-style)

    lwords = []
    stime = time.time()
    (recdict, rec_termcount) = ({}, {})
    for (t, tf) in tf_values:  # t=term, tf=term frequency
        term_recs = deserialize_via_marshal(terms_recs[t])
        # Accept the term when the candidate list is short enough, or when the
        # term appears in enough documents yet stays inside the configured
        # document-frequency window (fraction of col_size).
        if len(tf_values) <= methods[rank_method_code]["max_nr_words_lower"] or (len(term_recs) >= methods[rank_method_code]["min_nr_words_docs"] and (((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) <= methods[rank_method_code]["max_word_occurence"]) and ((float(len(term_recs)) / float(methods[rank_method_code]["col_size"])) >= methods[rank_method_code]["min_word_occurence"]))):  # too complicated...something must be done
            lwords.append((t, methods[rank_method_code]["rnkWORD_table"]))  # list of terms used
            (recdict, rec_termcount) = calculate_record_relevance_findsimilar((t, round(tf, 4)), term_recs, hitset, recdict, rec_termcount, verbose, "true")  # true tells the function to not calculate all unimportant terms
        # Stop once enough terms were accepted (or weights go negative) for long queries.
        if len(tf_values) > methods[rank_method_code]["max_nr_words_lower"] and (len(lwords) == methods[rank_method_code]["max_nr_words_upper"] or tf < 0):
            break

    if len(recdict) == 0 or len(lwords) == 0:
        return (None, "Could not find similar documents for this query.", "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance_findsimilar(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    if verbose > 0:
        voutput += "<br />Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Number of terms to use for query: %s<br />" % len(lwords)
        voutput += "Terms: %s<br />" % lwords
        voutput += "Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Prepare time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
    rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def update_bibsort_tables(recids, method, update_timestamp=True):
    """Update the bibsort data structures of sorting method `method`
    for the records in `recids`.

    recids -- intbitset of record IDs whose sorting data must be recalculated
    method -- name of the sorting method (bsrMETHOD.name)
    update_timestamp -- whether to refresh the method's timestamp when
                        writing the recalculated data back
    Returns True on success (including "nothing changed"), False on any error.
    """
    # Resolve the method's id, field definition and washer; bail out if absent.
    res = run_sql("SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found ' \
                      'in bsrMETHOD table.' % method, sys.stderr)
        return False

    # Load the current sorting data for the method; missing data means the
    # method was never built, which should have been handled earlier.
    res = run_sql("SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        data_dict = deserialize_via_marshal(res[0][0])
        # The ordered structures are deserialized lazily below, only if needed.
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.' \
                      % method)
        return False  # since this case should have been treated earlier

    # Get the values for the recids that need to be recalculated.
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s." \
                      % (method, str(recids)))
    else:
        apply_washer(field_data, washer)

    # If a recid is not in field_data, no value was found for it,
    # so it should be marked for deletion.
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                # We store the old value.
                recids_to_modify[recid] = data_dict[recid]
        else:  # recid is new, and needs to be inserted
            recids_to_insert.append(recid)

    # Remove the recids that were not previously in bibsort.
    recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict]

    # Dicts to keep the ordered values for the recids - useful for bucket insertion.
    recids_current_ordered = {}
    recids_old_ordered = {}

    if recids_to_insert or recids_to_modify or recids_to_delete:
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified." \
                          % len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered, \
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted." \
                          % len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered, \
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted." \
                          % len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered, data_list_sorted, recid)

        # Record the new ordered value of everything that changed, so the
        # bucket update below knows where each recid now sits.
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]

        # Write the modifications to db.
        executed = write_to_methoddata_table(method_id, data_dict, \
                                             data_dict_ordered, data_list_sorted, update_timestamp)
        if not executed:
            return False

        # Update buckets.
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert, \
                                   recids_old_ordered, method_id, update_timestamp)
        except Error as err:
            # BUGFIX: the format arguments were swapped ("% (method, err)"),
            # which logged "[<method>] ... for method <error>". The error now
            # fills the brackets and the method name follows "for method".
            write_message("[%s] The bucket data for method %s has not been updated" \
                          % (err, method), sys.stderr)
            return False
    return True