def last_updated_result(rank_method_code):
    """ return the last value of dictionary in rnkMETHODDATA table
        if it exists and initialize the value of last updated records
        by zero, otherwise an initial dictionary with zero as value
        for all recids
    """
    result = [{}, {}, {}]
    query = """select relevance_data from rnkMETHOD, rnkMETHODDATA where
               rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
               and rnkMETHOD.Name = '%s'""" % rank_method_code
    rdict = run_sql(query)
    if rdict and rdict[0] and rdict[0][0]:
        # has to be prepared for corrupted data!
        try:
            dic = deserialize_via_marshal(rdict[0][0])
        except zlib.error:
            return [{}, {}, {}]
        query = "select object_value from rnkCITATIONDATA where object_name='citationdict'"
        cit_compressed = run_sql(query)
        cit = []
        if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
            cit = deserialize_via_marshal(cit_compressed[0][0])
            if cit:
                query = """select object_value from rnkCITATIONDATA
                           where object_name='reversedict'"""
                ref_compressed = run_sql(query)
                if ref_compressed and ref_compressed[0] and ref_compressed[0][0]:
                    ref = deserialize_via_marshal(ref_compressed[0][0])
                    result = (dic, cit, ref)
    return result
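# The snippets in this collection all go through the same pair of helpers
# from invenio.dbquery. Below is a minimal sketch of the round-trip they
# assume: a marshal-serialized payload compressed with zlib, which is why
# callers such as last_updated_result() above guard deserialize_via_marshal
# with `except zlib.error`. The real implementation lives in
# invenio.dbquery and may differ in detail.
import marshal
import zlib

def serialize_via_marshal(obj):
    """Serialize a Python structure into a compressed blob (sketch)."""
    return zlib.compress(marshal.dumps(obj))

def deserialize_via_marshal(blob):
    """Restore a Python structure from a compressed blob (sketch)."""
    return marshal.loads(zlib.decompress(blob))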
def fill():
    alldicts = {}
    from invenio.bibrank_tag_based_indexer import fromDB
    redis = get_redis()
    serialized_weights = redis.get('citations_weights')
    if serialized_weights:
        weights = deserialize_via_marshal(serialized_weights)
    else:
        weights = fromDB('citation')

    alldicts['citations_weights'] = weights
    # for cited:M->N queries, it is interesting to cache also
    # some preprocessed citationdict:
    alldicts['citations_keys'] = intbitset(weights.keys())

    # Citation counts
    alldicts['citations_counts'] = [t for t in weights.iteritems()]
    alldicts['citations_counts'].sort(key=itemgetter(1), reverse=True)

    # Self-cites
    serialized_weights = redis.get('selfcites_weights')
    if serialized_weights:
        selfcites = deserialize_via_marshal(serialized_weights)
    else:
        selfcites = fromDB('selfcites')
    selfcites_weights = {}
    for recid, counts in alldicts['citations_counts']:
        selfcites_weights[recid] = counts - selfcites.get(recid, 0)
    alldicts['selfcites_weights'] = selfcites_weights
    alldicts['selfcites_counts'] = [(recid, selfcites_weights.get(recid, cites))
                                    for recid, cites in alldicts['citations_counts']]
    alldicts['selfcites_counts'].sort(key=itemgetter(1), reverse=True)

    return alldicts
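# Hypothetical use of the cache built by fill() above: the dictionary keys
# come from fill() itself, while the variable names and the example recid
# are purely illustrative.
cache = fill()
top_cited = cache['citations_counts'][:10]              # (recid, count) pairs, most cited first
is_cited = 42 in cache['citations_keys']                 # fast membership test via intbitset
top_without_selfcites = cache['selfcites_counts'][:10]   # counts with self-citations subtracted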
def last_updated_result(rank_method_code):
    """ return the last value of dictionary in rnkMETHODDATA table
        if it exists and initialize the value of last updated records
        by zero, otherwise an initial dictionary with zero as value
        for all recids
    """
    result = [{}, {}, {}]
    query = """select relevance_data from rnkMETHOD, rnkMETHODDATA where
               rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
               and rnkMETHOD.Name = '%s'""" % rank_method_code
    rdict = run_sql(query)
    if rdict and rdict[0] and rdict[0][0]:
        # has to be prepared for corrupted data!
        try:
            dic = deserialize_via_marshal(rdict[0][0])
        except zlib.error:
            return [{}, {}, {}]
        query = "select object_value from rnkCITATIONDATA where object_name='citationdict'"
        cit_compressed = run_sql(query)
        cit = []
        if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
            cit = deserialize_via_marshal(cit_compressed[0][0])
            if cit:
                query = """select object_value from rnkCITATIONDATA
                           where object_name='reversedict'"""
                ref_compressed = run_sql(query)
                if ref_compressed and ref_compressed[0] and ref_compressed[0][0]:
                    ref = deserialize_via_marshal(ref_compressed[0][0])
                    result = (dic, cit, ref)
    return result
def test_filetypes_of_records(self):
    """tests files extensions of record 1 and 77"""
    query1 = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=1" \
             % get_index_id_from_index_name('filetype')
    query2 = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=77" \
             % get_index_id_from_index_name('filetype')
    res1 = run_sql(query1)
    res2 = run_sql(query2)
    set1 = deserialize_via_marshal(res1[0][0])
    set2 = deserialize_via_marshal(res2[0][0])
    self.assertEqual(set1, ['gif', 'jpg'])
    self.assertEqual(set2, ['pdf', 'ps.gz'])
def rank_method_stat(rank_method_code, reclist, lwords):
    """Shows some statistics about the search result.
    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked records
    lwords - the words in the query"""
    voutput = ""
    if len(reclist) > 20:
        j = 20
    else:
        j = len(reclist)
    voutput += "<br />Rank statistics:<br />"
    for i in range(1, j + 1):
        voutput += "%s,Recid:%s,Score:%s<br />" % (i,
                                                   reclist[len(reclist) - i][0],
                                                   reclist[len(reclist) - i][1])
        for (term, table) in lwords:
            term_recs = run_sql("""SELECT hitlist FROM %s WHERE term=%%s""" % table, (term,))
            if term_recs:
                term_recs = deserialize_via_marshal(term_recs[0][0])
                if term_recs.has_key(reclist[len(reclist) - i][0]):
                    voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
        voutput += "<br />"

    voutput += "<br />Score variation:<br />"
    count = {}
    for i in range(0, len(reclist)):
        count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
    i = 100
    while i >= 0:
        if count.has_key(i):
            voutput += "%s-%s<br />" % (i, count[i])
        i -= 1
def rank_method_stat(rank_method_code, reclist, lwords):
    """Shows some statistics about the search result.
    rank_method_code - name field from rnkMETHOD
    reclist - a list of sorted and ranked records
    lwords - the words in the query"""
    voutput = ""
    if len(reclist) > 20:
        j = 20
    else:
        j = len(reclist)
    voutput += "<br />Rank statistics:<br />"
    for i in range(1, j + 1):
        voutput += "%s,Recid:%s,Score:%s<br />" % (i,
                                                   reclist[len(reclist) - i][0],
                                                   reclist[len(reclist) - i][1])
        for (term, table) in lwords:
            term_recs = run_sql("""SELECT hitlist FROM %s WHERE term=%%s""" % table, (term,))
            if term_recs:
                term_recs = deserialize_via_marshal(term_recs[0][0])
                if term_recs.has_key(reclist[len(reclist) - i][0]):
                    voutput += "%s-%s / " % (term, term_recs[reclist[len(reclist) - i][0]])
        voutput += "<br />"

    voutput += "<br />Score variation:<br />"
    count = {}
    for i in range(0, len(reclist)):
        count[reclist[i][1]] = count.get(reclist[i][1], 0) + 1
    i = 100
    while i >= 0:
        if count.has_key(i):
            voutput += "%s-%s<br />" % (i, count[i])
        i -= 1
def get_user_preferences(uid):
    pref = run_sql("SELECT id, settings FROM user WHERE id=%s", (uid, ))
    if pref:
        try:
            return deserialize_via_marshal(pref[0][1])
        except:
            pass
    return get_default_user_preferences()  # empty dict means no preferences
def test_splliting_and_indexing_CJK_characters_reversed_table(self):
    """CJK Tokenizer - comparing terms for record with chinese poetry in title index, reverse table"""
    query = "SELECT * from test_idxWORD%02dR where id_bibrec='104'" % get_index_id_from_index_name('title')
    res = run_sql(query)
    iset = []
    if res:
        iset = deserialize_via_marshal(res[0][1])
    self.assertEqual(iset, ['\xe6\x95\xac', '\xe7\x8d\xa8', '\xe4\xba\xad', '\xe5\x9d\x90'])
def fromDB(rank_method_code):
    """Get the data for a rank method"""
    id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
    res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], ))
    if res:
        return deserialize_via_marshal(res[0][0])
    else:
        return {}
def get_user_preferences(uid):
    pref = run_sql("SELECT id, settings FROM user WHERE id=%s", (uid,))
    if pref:
        try:
            return deserialize_via_marshal(pref[0][1])
        except:
            pass
    return get_default_user_preferences()  # empty dict means no preferences
def process_result_value(self, value, dialect):
    if value is not None:
        try:
            value = deserialize_via_marshal(value)
        except:
            value = None
    return value if value is not None else \
        (self.default_value() if callable(self.default_value) else self.default_value)
def get_cit_dict(name):
    """get a named citation dict from the db"""
    cdict = run_sql("""SELECT object_value FROM rnkCITATIONDATA
                       WHERE object_name = %s""", (name, ))
    if cdict and cdict[0] and cdict[0][0]:
        dict_from_db = deserialize_via_marshal(cdict[0][0])
    else:
        dict_from_db = {}
    return dict_from_db
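# Hypothetical calls to get_cit_dict() above. The object names
# 'citationdict' and 'reversedict' come from other snippets in this
# collection (see last_updated_result() and get_citations_from_db());
# the variable names and the cited-by vs. reference reading are
# illustrative only.
cited_by_dict = get_cit_dict('citationdict')   # recid -> recids citing it (per get_citations_from_db docstring)
reference_dict = get_cit_dict('reversedict')   # presumably the reverse mapping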
def do_upgrade():
    rows_to_change = run_sql("SELECT id, arguments FROM oaiHARVEST", with_dict=True)
    # Move away from old columns
    for row in rows_to_change:
        if row['arguments']:
            arguments = deserialize_via_marshal(row['arguments'])
            if "c_cfg-file" in arguments:
                arguments['c_stylesheet'] = arguments['c_cfg-file']
                del arguments['c_cfg-file']
                run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                        (serialize_via_marshal(arguments), row['id']))
def get_initial_author_dict():
    """read author->citedinlist dict from the db"""
    adict = {}
    try:
        ah = run_sql("select aterm,hitlist from rnkAUTHORDATA")
        for (a, h) in ah:
            adict[a] = deserialize_via_marshal(h)
        return adict
    except:
        register_exception(prefix="could not read rnkAUTHORDATA",
                           alert_admin=True)
        return {}
def get_uid_based_on_pref(prefname, prefvalue):
    """get the user's UID based on where his/her preference prefname
    has value prefvalue in preferences"""
    prefs = run_sql("SELECT id, settings FROM user WHERE settings is not NULL")
    the_uid = None
    for pref in prefs:
        try:
            settings = deserialize_via_marshal(pref[1])
            if (settings.has_key(prefname)) and (settings[prefname] == prefvalue):
                the_uid = pref[0]
        except:
            pass
    return the_uid
def get_author_cited_by(authorstring):
    """Return a list of doc ids [y1,y2,..] for the author given as param,
       such that y1,y2.. cite that author
    """
    citations = []
    res = run_sql("select hitlist from rnkAUTHORDATA where aterm=%s",
                  (authorstring,))
    if res and res[0] and res[0][0]:
        # has to be prepared for corrupted data!
        try:
            citations = deserialize_via_marshal(res[0][0])
        except:
            citations = []
    return citations
def get_cit_dict(name):
    """get a named citation dict from the db"""
    cdict = {}
    try:
        cdict = run_sql("select object_value from rnkCITATIONDATA where object_name = %s",
                        (name,))
        if cdict and cdict[0] and cdict[0][0]:
            dict_from_db = deserialize_via_marshal(cdict[0][0])
            return dict_from_db
        else:
            return {}
    except:
        register_exception(prefix="could not read " + name + " from db",
                           alert_admin=True)
        return {}
def get_author_cited_by(authorstring):
    """Return a list of doc ids [y1,y2,..] for the author given as param,
       such that y1,y2.. cite that author
    """
    citations = []
    res = run_sql("select hitlist from rnkAUTHORDATA where aterm=%s",
                  (authorstring, ))
    if res and res[0] and res[0][0]:
        # has to be prepared for corrupted data!
        try:
            citations = deserialize_via_marshal(res[0][0])
        except:
            citations = []
    return citations
def test_authority_record_enriched_index(self):
    """bibindex - test whether reverse index for bibliographic record
       contains words from referenced authority records"""
    bibRecID = 9
    authority_string = 'jonathan'
    index_name = 'author'
    table = "idxWORD%02dR" % get_index_id_from_index_name(index_name)
    reindex_for_type_with_bibsched(index_name)
    self.assertTrue(
        authority_string in deserialize_via_marshal(
            run_sql("SELECT termlist FROM %s WHERE id_bibrec = %s" % (table, bibRecID))[0][0]
        )
    )
def del_recids(rank_method_code, range_rec):
    """Delete some records from the rank method"""
    id = run_sql("SELECT id from rnkMETHOD where name=%s", (rank_method_code, ))
    res = run_sql("SELECT relevance_data FROM rnkMETHODDATA WHERE id_rnkMETHOD=%s", (id[0][0], ))
    if res:
        rec_dict = deserialize_via_marshal(res[0][0])
        write_message("Old size: %s" % len(rec_dict))
        for (recids, recide) in range_rec:
            for i in range(int(recids), int(recide)):
                if rec_dict.has_key(i):
                    del rec_dict[i]
        write_message("New size: %s" % len(rec_dict))
        intoDB(rec_dict, begin_date, rank_method_code)
    else:
        write_message("Create before deleting!")
def get_data_for_definition_rnk(method_name, rnk_name):
    '''Returns the dictionary with data for method_name ranking method'''
    try:
        res = run_sql('SELECT d.relevance_data \
                      from rnkMETHODDATA d, rnkMETHOD r WHERE \
                      d.id_rnkMETHOD = r.id AND \
                      r.name = %s', (rnk_name, ))
        if res and res[0]:
            write_message('Data extracted from table rnkMETHODDATA for sorting method %s'
                          % method_name, verbose=5)
            return deserialize_via_marshal(res[0][0])
    except Error, err:
        write_message("No data could be found for sorting method %s. "
                      "The following error occurred: [%s]"
                      % (method_name, err), stream=sys.stderr)
    return {}
def last_updated_result(rank_method_code):
    """ return the last value of dictionary in rnkMETHODDATA table
        if it exists and initialize the value of last updated records
        by zero, otherwise an initial dictionary with zero as value
        for all recids
    """
    query = """SELECT relevance_data FROM rnkMETHOD, rnkMETHODDATA WHERE
               rnkMETHOD.id = rnkMETHODDATA.id_rnkMETHOD
               AND rnkMETHOD.Name = '%s'""" % rank_method_code
    try:
        rdict = run_sql(query)[0][0]
    except IndexError:
        dic = {}
    else:
        dic = deserialize_via_marshal(rdict)
    return dic
def do_upgrade():
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    if '`arguments` text' in create_statement:
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")

    # translate old values
    if '`bibconvertcfgfile`' in create_statement:
        rows_to_change = run_sql("SELECT id, bibconvertcfgfile, bibfilterprogram, arguments FROM oaiHARVEST",
                                 with_dict=True)
        # Move away from old columns
        for row in rows_to_change:
            if row['arguments']:
                arguments = deserialize_via_marshal(row['arguments'])
            else:
                arguments = {}
            arguments['c_cfg-file'] = row['bibconvertcfgfile']
            arguments['f_filter-file'] = row['bibfilterprogram']
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                    (serialize_via_marshal(arguments), row['id']))
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
def get_citations_from_db():
    """gets the citation data (who cites who) from the rnkCITATIONDATA table,
    and returns:
    -a dictionary of type x:{x1,x2..}, where x is cited by x1,x2..
    -a dict of type a:{b} where recid 'a' is associated with an index 'b'"""
    dict_of_ids = {}
    count = 0
    query = """select object_value from rnkCITATIONDATA
               where object_name = 'citationdict'"""
    cit_compressed = run_sql(query)
    cit = []
    if cit_compressed and cit_compressed[0] and cit_compressed[0][0]:
        cit = deserialize_via_marshal(cit_compressed[0][0])
        if cit:
            for item in cit:
                # check for duplicates in citation dictionary
                cit[item] = set(cit[item])
                if item in cit[item]:
                    cit[item].remove(item)
                if item not in dict_of_ids:
                    dict_of_ids[item] = count
                    count += 1
                for value in cit[item]:
                    if value not in dict_of_ids:
                        dict_of_ids[value] = count
                        count += 1
            write_message("Citation data collected from rnkCITATIONDATA", verbose=2)
            write_message("Ids and recids correspondence: %s"
                          % str(dict_of_ids), verbose=9)
            write_message("Citations: %s" % str(cit), verbose=9)
            return cit, dict_of_ids
        else:
            write_message("Error while extracting citation data "
                          "from rnkCITATIONDATA table", verbose=1)
    else:
        write_message("Error while extracting citation data "
                      "from rnkCITATIONDATA table", verbose=1)
    return {}, {}
def cache_filler():
    alldicts = {}
    try:
        res = run_sql("SELECT object_name,object_value FROM rnkCITATIONDATA")
    except OperationalError:
        # database problems, return empty cache
        return {}
    for row in res:
        object_name = row[0]
        object_value = row[1]
        try:
            object_value_dict = deserialize_via_marshal(object_value)
        except:
            object_value_dict = {}
        alldicts[object_name] = object_value_dict
        if object_name == 'citationdict':
            # for cited:M->N queries, it is interesting to cache also
            # some preprocessed citationdict:
            alldicts['citationdict_keys'] = object_value_dict.keys()
            alldicts['citationdict_keys_intbitset'] = intbitset(object_value_dict.keys())
    return alldicts
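# Hypothetical lookup against the cache returned by cache_filler(); the
# key names come from the function above, the recid 42 is illustrative.
cache = cache_filler()
citing_records = cache.get('citationdict', {}).get(42, [])
is_cited = 42 in cache.get('citationdict_keys_intbitset', intbitset())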
def do_upgrade():
    create_statement = run_sql('SHOW CREATE TABLE oaiHARVEST')[0][1]
    if '`arguments` text' in create_statement:
        run_sql("ALTER TABLE oaiHARVEST CHANGE arguments arguments blob")

    # translate old values
    if '`bibconvertcfgfile`' in create_statement:
        rows_to_change = run_sql(
            "SELECT id, bibconvertcfgfile, bibfilterprogram, arguments FROM oaiHARVEST",
            with_dict=True)
        # Move away from old columns
        for row in rows_to_change:
            if row['arguments']:
                arguments = deserialize_via_marshal(row['arguments'])
            else:
                arguments = {}
            arguments['c_cfg-file'] = row['bibconvertcfgfile']
            arguments['f_filter-file'] = row['bibfilterprogram']
            run_sql("UPDATE oaiHARVEST set arguments=%s WHERE id=%s",
                    (serialize_via_marshal(arguments), row['id']))
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibconvertcfgfile")
        run_sql("ALTER TABLE oaiHARVEST DROP COLUMN bibfilterprogram")
def _get_users_invenio12(*args, **kwargs):
    """Get user accounts Invenio 1."""
    from invenio.dbquery import run_sql, deserialize_via_marshal
    User = namedtuple('User', [
        'id', 'email', 'password', 'password_salt', 'note',
        'full_name', 'settings', 'nickname', 'last_login'
    ])
    users = run_sql(
        'SELECT id, email, password, note, settings, nickname, last_login'
        ' FROM user',
        run_on_slave=True)
    return len(users), [
        User(
            id=user[0],
            email=user[1],
            password=user[2].decode('latin1'),
            password_salt=user[1],
            note=user[3],
            full_name=user[5],
            settings=deserialize_via_marshal(user[4]) if user[4] else {},
            # we don't have proper nicknames on Invenio v1
            nickname='id_{0}'.format(user[0]),
            last_login=user[6])
        for user in users
    ]
def test_records_for_number_of_copies_record32(self):
    """checks content of itemcount index for record: 32"""
    query = "SELECT termlist FROM idxWORD%02dR WHERE id_bibrec=32" \
            % get_index_id_from_index_name('itemcount')
    res = run_sql(query)
    self.assertEqual(deserialize_via_marshal(res[0][0]), ['3'])
def update_bibsort_tables(recids, method, update_timestamp=True):
    """Updates the data structures for sorting method: method
    for the records in recids"""
    res = run_sql("SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found '
                      'in bsrMETHOD table.' % method, sys.stderr)
        return False
    res = run_sql("SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        data_dict = deserialize_via_marshal(res[0][0])
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.'
                      % method)
        return False  # since this case should have been treated earlier
    # get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s."
                      % (method, str(recids)))
    else:
        apply_washer(field_data, washer)
    # if a recid is not in field_data that is because no value was found for it
    # so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                # we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else:
            # recid is new, and needs to be inserted
            recids_to_insert.append(recid)
    # remove the recids that were not previously in bibsort
    recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict]
    # dicts to keep the ordered values for the recids - useful for bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}
    if recids_to_insert or recids_to_modify or recids_to_delete:
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified."
                          % len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered,
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted."
                          % len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered,
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted."
                          % len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered,
                                      data_list_sorted, recid)
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        # write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict,
                                             data_dict_ordered, data_list_sorted,
                                             update_timestamp)
        if not executed:
            return False
        # update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert,
                                   recids_old_ordered, method_id, update_timestamp)
        except Error, err:
            write_message("[%s] The bucket data for method %s has not been updated"
                          % (method, err), sys.stderr)
            return False
for p in ptrs:
    pdata[p] = {}
    pdata[p]['tickets'] = run_sql("select * from aidPERSONID where tag like 'rt%%' and personid = %s", (p,))
    pdata[p]['cid'] = run_sql("select data from aidPERSONID where tag = 'canonical_name' and personid = %s", (p,))

sm = serialize_via_marshal(pdata)
fp = open('pdata2.dat', 'w')
fp.write(sm)
fp.close()

### THEN ON TEST

from invenio.dbquery import run_sql, deserialize_via_marshal, serialize_via_marshal

fp = open('pdata.dat', 'r')
dm = deserialize_via_marshal(fp.read())
fp.close()

for d in dm:
    dm[d]['tickets'] = list(dm[d]['tickets'])
    for i, t in enumerate(dm[d]['tickets']):
        dm[d]['tickets'][i] = list(t)

for pp in dm:
    cname = dm[pp]['cid'][0][0]
    dbs = run_sql('select personid from aidPERSONID where tag = "canonical_name" and data = %s', (cname,))
    pid = dbs[0][0]
    dbp = run_sql("select data from aidPERSONID where tag = 'paper' and personid = %s", (pid,))
    refs = [j[3] for j in dm[pp]['tickets'] if j[2] == 'rt_confirm' or j[2] == 'rt_repeal']
    common_refs = set(refs).intersection(set([i[0] for i in dbp]))
    for t in dm[pp]['tickets']:
        if t[3] in common_refs or not t[2] in ('rt_confirm', 'rt_repeal'):
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance, verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD,
                       used to get predetermined values from rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    global voutput
    voutput = ""
    rnkdict = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s",
        (rank_method_code, ))

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)

    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    for j in range(0, len(lwords)):  # find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if string.find(lword, "->") > -1:
                lword = string.split(lword, "->")
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            elif int(lword) < max_recid + 1:
                lwords_hitset.add(int(lword))
            else:
                return (None, "Warning: Given record IDs are out of range.", "", voutput)
    rnkdict = deserialize_via_marshal(rnkdict[0][0])

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />" % rank_method_code
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset:
        # rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if rnkdict.has_key(recID):
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else:
        # rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if rnkdict.has_key(recID) and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)

    reclist.sort(lambda x, y: cmp(x[1], y[1]))
    return (reclist_addend + reclist, methods[rank_method_code]["prefix"],
            methods[rank_method_code]["postfix"], voutput)
def rank_by_method(rank_method_code, lwords, hitset, rank_limit_relevance, verbose):
    """Ranking of records based on predetermined values.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD,
                       used to get predetermined values from rnkMETHODDATA
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records, with unsorted added to the end: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    rnkdict = run_sql(
        "SELECT relevance_data FROM rnkMETHODDATA,rnkMETHOD where rnkMETHOD.id=id_rnkMETHOD and rnkMETHOD.name=%s",
        (rank_method_code,),
    )

    if not rnkdict:
        return (None, "Warning: Could not load ranking data for method %s." % rank_method_code, "", voutput)

    max_recid = 0
    res = run_sql("SELECT max(id) FROM bibrec")
    if res and res[0][0]:
        max_recid = int(res[0][0])

    lwords_hitset = None
    for j in range(0, len(lwords)):  # find which docs to search based on ranges..should be done in search_engine...
        if lwords[j] and lwords[j][:6] == "recid:":
            if not lwords_hitset:
                lwords_hitset = intbitset()
            lword = lwords[j][6:]
            if lword.find("->") > -1:
                lword = lword.split("->")
                if int(lword[0]) >= max_recid or int(lword[1]) >= max_recid + 1:
                    return (None, "Warning: Given record IDs are out of range.", "", voutput)
                for i in range(int(lword[0]), int(lword[1])):
                    lwords_hitset.add(int(i))
            elif int(lword) < max_recid + 1:
                lwords_hitset.add(int(lword))
            else:
                return (None, "Warning: Given record IDs are out of range.", "", voutput)
    rnkdict = deserialize_via_marshal(rnkdict[0][0])

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using rank_by_method function in bibrank_record_sorter<br />"
            % rank_method_code
        )
        voutput += "Ranking data loaded, size of structure: %s<br />" % len(rnkdict)
    lrecIDs = list(hitset)

    if verbose > 0:
        voutput += "Number of records to rank: %s<br />" % len(lrecIDs)
    reclist = []
    reclist_addend = []

    if not lwords_hitset:
        # rank all docs, can this be speed up using something else than for loop?
        for recID in lrecIDs:
            if recID in rnkdict:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            else:
                reclist_addend.append((recID, 0))
    else:
        # rank docs in hitset, can this be speed up using something else than for loop?
        for recID in lwords_hitset:
            if recID in rnkdict and recID in hitset:
                reclist.append((recID, rnkdict[recID]))
                del rnkdict[recID]
            elif recID in hitset:
                reclist_addend.append((recID, 0))

    if verbose > 0:
        voutput += "Number of records ranked: %s<br />" % len(reclist)
        voutput += "Number of records not ranked: %s<br />" % len(reclist_addend)

    reclist.sort(lambda x, y: cmp(x[1], y[1]))
    return (
        reclist_addend + reclist,
        METHODS[rank_method_code]["prefix"],
        METHODS[rank_method_code]["postfix"],
        voutput,
    )
    global voutput
    voutput = ""

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code
    rank_limit_relevance = methods[rank_method_code]["default_min_relevance"]

    try:
        recID = int(recID)
    except Exception, e:
        return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput)

    rec_terms = run_sql("""SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1],
                        (recID,))
    if not rec_terms:
        return (None, "Warning: Requested record does not seem to exist.", "", voutput)
    rec_terms = deserialize_via_marshal(rec_terms[0][0])

    # Get all documents using terms from the selected documents
    if len(rec_terms) == 0:
        return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput)
    else:
        terms = "%s" % rec_terms.keys()
        terms_recs = dict(run_sql("""SELECT term, hitlist FROM %s WHERE term IN (%s)"""
                                  % (methods[rank_method_code]["rnkWORD_table"], terms[1:len(terms) - 1])))

    tf_values = {}
    # Calculate all term frequencies
    for (term, tf) in rec_terms.iteritems():
        if len(term) >= methods[rank_method_code]["min_word_length"] and terms_recs.has_key(term) and tf[1] != 0:
            tf_values[term] = int((1 + math.log(tf[0])) * tf[1])  # calculate term weight
    tf_values = tf_values.items()
    tf_values.sort(lambda x, y: cmp(y[1], x[1]))  # sort based on weight
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking records containing specified words and returning a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code
        )

    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if (
            not methods[rank_method_code]["stopwords"] == "True"
            or methods[rank_method_code]["stopwords"]
            and not is_stopword(term, 1)
        ):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(
                string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], " ", term))
            )
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"):  # stem word
                    term = stem(string.replace(term, " ", ""), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,)
        )
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None
            )
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "",
            voutput,
        )
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += (
            "Number of terms: %s<br />"
            % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        )
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
"<br />Running rank method: %s, using find_similar/word_frequency in bibrank_record_sorter<br />" % rank_method_code ) rank_limit_relevance = methods[rank_method_code]["default_min_relevance"] try: recID = int(recID) except Exception, e: return (None, "Warning: Error in record ID, please check that a number is given.", "", voutput) rec_terms = run_sql( """SELECT termlist FROM %sR WHERE id_bibrec=%%s""" % methods[rank_method_code]["rnkWORD_table"][:-1], (recID,) ) if not rec_terms: return (None, "Warning: Requested record does not seem to exist.", "", voutput) rec_terms = deserialize_via_marshal(rec_terms[0][0]) # Get all documents using terms from the selected documents if len(rec_terms) == 0: return (None, "Warning: Record specified has no content indexed for use with this method.", "", voutput) else: terms = "%s" % rec_terms.keys() terms_recs = dict( run_sql( """SELECT term, hitlist FROM %s WHERE term IN (%s)""" % (methods[rank_method_code]["rnkWORD_table"], terms[1 : len(terms) - 1]) ) ) tf_values = {} # Calculate all term frequencies
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking records containing specified words and returning a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword(term):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], ' ', term)))
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"):  # stem word
                    term = stem(string.replace(term, ' ', ''), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,))
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])), term_recs,
                                                                  hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def update_bibsort_tables(recids, method, update_timestamp=True):
    """Updates the data structures for sorting method: method
    for the records in recids"""
    res = run_sql("SELECT id, definition, washer \
                  from bsrMETHOD where name = %s", (method, ))
    if res and res[0]:
        method_id = res[0][0]
        definition = res[0][1]
        washer = res[0][2]
    else:
        write_message('No sorting method called %s could be found '
                      'in bsrMETHOD table.' % method, sys.stderr)
        return False
    res = run_sql("SELECT data_dict, data_dict_ordered, data_list_sorted \
                  FROM bsrMETHODDATA where id_bsrMETHOD = %s", (method_id, ))
    if res and res[0]:
        data_dict = deserialize_via_marshal(res[0][0])
        data_dict_ordered = {}
        data_list_sorted = []
    else:
        write_message('No data could be found for the sorting method %s.'
                      % method)
        return False  # since this case should have been treated earlier
    # get the values for the recids that need to be recalculated
    field_data = get_field_data(recids, method, definition)
    if not field_data:
        write_message("Possible error: the method %s has no data for records %s."
                      % (method, str(recids)))
    else:
        apply_washer(field_data, washer)
    # if a recid is not in field_data that is because no value was found for it
    # so it should be marked for deletion
    recids_to_delete = list(recids.difference(intbitset(field_data.keys())))
    recids_to_insert = []
    recids_to_modify = {}
    for recid in field_data:
        if recid in data_dict:
            if data_dict[recid] != field_data[recid]:
                # we store the old value
                recids_to_modify[recid] = data_dict[recid]
        else:
            # recid is new, and needs to be inserted
            recids_to_insert.append(recid)
    # remove the recids that were not previously in bibsort
    recids_to_delete = [recid for recid in recids_to_delete if recid in data_dict]
    # dicts to keep the ordered values for the recids - useful for bucket insertion
    recids_current_ordered = {}
    recids_old_ordered = {}
    if recids_to_insert or recids_to_modify or recids_to_delete:
        data_dict_ordered = deserialize_via_marshal(res[0][1])
        data_list_sorted = deserialize_via_marshal(res[0][2])
        if recids_to_modify:
            write_message("%s records have been modified."
                          % len(recids_to_modify), verbose=5)
            for recid in recids_to_modify:
                recids_old_ordered[recid] = data_dict_ordered[recid]
                perform_modify_record(data_dict, data_dict_ordered,
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_insert:
            write_message("%s records have been inserted."
                          % len(recids_to_insert), verbose=5)
            for recid in recids_to_insert:
                perform_insert_record(data_dict, data_dict_ordered,
                                      data_list_sorted, field_data[recid], recid)
        if recids_to_delete:
            write_message("%s records have been deleted."
                          % len(recids_to_delete), verbose=5)
            for recid in recids_to_delete:
                perform_delete_record(data_dict, data_dict_ordered,
                                      data_list_sorted, recid)
        for recid in recids_to_modify:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        for recid in recids_to_insert:
            recids_current_ordered[recid] = data_dict_ordered[recid]
        # write the modifications to db
        executed = write_to_methoddata_table(method_id, data_dict,
                                             data_dict_ordered, data_list_sorted,
                                             update_timestamp)
        if not executed:
            return False
        # update buckets
        try:
            perform_update_buckets(recids_current_ordered, recids_to_insert,
                                   recids_old_ordered, method_id, update_timestamp)
        except Error, err:
            write_message("[%s] The bucket data for method %s has not been updated"
                          % (method, err), sys.stderr)
            return False