def fill_self_cites_tables(config):
    """
    Populate the self-cites tables for every record id found in bibrec.

    Meant for an installation on which the self-cites daemon has never run.

    config is a dict; its 'algorithm' entry selects the citations function
    and decides whether the intermediary tables are filled first. The whole
    dict is also forwarded to update_self_cites_tables().
    """
    algorithm = config['algorithm']
    tags = get_authors_tags()
    rows = run_sql('SELECT id FROM bibrec ORDER BY id')
    all_ids = [row[0] for row in rows]
    total = len(all_ids)
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)

    def report(template, position):
        # Every 1000th record: publish the progress string, log it, then
        # call task_sleep_now_if_required().
        if position % 1000 != 0:
            return
        msg = template % (position, total)
        task_update_progress(msg)
        write_message(msg)
        task_sleep_now_if_required()

    # The intermediary tables are only filled for the 'friends' algorithm.
    if algorithm == 'friends':
        for position, recid in enumerate(all_ids):
            report('intermediate %d/%d', position)
            update_self_cites_tables(recid, config, tags)

    # Second pass: compute and store the self-citations of each record.
    for position, recid in enumerate(all_ids):
        report('final %d/%d', position)
        compute_and_store_self_citations(recid, tags, citations_fun)
def test_compute_friends_self_citations(self):
    """compute_friends_self_citations() on record 1 yields an empty set."""
    from invenio.bibrank_selfcites_indexer import (
        compute_friends_self_citations,
        get_authors_tags,
    )

    result = compute_friends_self_citations(1, get_authors_tags())
    self.assertEqual(result, set())
def test_get_authors_tags(self):
    """
    test_get_authors_tags

    Mainly a check that get_authors_tags() runs without raising. The tag
    values are customizable, so they are not inspected; only the number of
    entries (four) is asserted.
    """
    from invenio.bibrank_selfcites_indexer import get_authors_tags

    self.assertEqual(len(get_authors_tags()), 4)
def test_get_author_coauthors_list(self):
    """
    get_author_coauthors_list() returns a truthy value for the authors of
    record 1 when called with a 'friends_threshold' of 3.
    """
    from invenio.bibrank_selfcites_indexer import get_author_coauthors_list
    from invenio.bibrank_selfcites_indexer import get_authors_from_record
    from invenio.bibrank_selfcites_indexer import get_authors_tags

    tags = get_authors_tags()
    config = {'friends_threshold': 3}
    authors = get_authors_from_record(1, tags)
    # assertTrue replaces the deprecated assert_ alias (same truthiness check).
    self.assertTrue(get_author_coauthors_list(authors, config))
def test_process_one(self):
    """
    Run process_one() on record 1 once per entry of ALL_ALGORITHMS, using
    the citations function returned for that algorithm.

    There is no assertion; the test passes when no call raises.
    """
    from invenio.bibrank_selfcites_indexer import (
        ALL_ALGORITHMS,
        get_authors_tags,
    )
    from invenio.bibrank_selfcites_task import (
        get_citations_fun,
        process_one,
    )

    tags = get_authors_tags()
    for name in ALL_ALGORITHMS:
        process_one(1, tags, get_citations_fun(algorithm=name))
def test_compute_and_store_self_citations(self):
    """
    Run compute_and_store_self_citations() on record 1 once per entry of
    ALL_ALGORITHMS, using the citations function returned for that algorithm.

    There is no assertion; the test passes when no call raises.
    """
    from invenio.bibrank_selfcites_indexer import (
        ALL_ALGORITHMS,
        get_authors_tags,
    )
    from invenio.bibrank_selfcites_task import (
        compute_and_store_self_citations,
        get_citations_fun,
    )

    tags = get_authors_tags()
    for name in ALL_ALGORITHMS:
        compute_and_store_self_citations(
            1, tags, get_citations_fun(algorithm=name))
def test_get_authors_from_record(self):
    """
    Call get_authors_from_record() on record 1 around two assignments to
    CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID.

    After the name is set to 0 the result must be truthy; after it is set
    to 1 the call only has to complete without raising.
    """
    from invenio.bibrank_selfcites_indexer import get_authors_from_record
    from invenio.bibrank_selfcites_indexer import get_authors_tags
    from invenio.config import CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID

    # NOTE(review): the assignments below rebind only this function's local
    # name. They do not change the attribute on invenio.config, so both
    # calls to get_authors_from_record() presumably run under the same
    # setting. Confirm how the indexer reads this flag and toggle it there
    # if both modes are really meant to be exercised.
    old_config = CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID
    tags = get_authors_tags()

    CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID = 0
    # assertTrue replaces the deprecated assert_ alias (same truthiness check).
    self.assertTrue(get_authors_from_record(1, tags))

    CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID = 1
    get_authors_from_record(1, tags)

    CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID = old_config
def test_store_record(self):
    """
    store_record() leaves at least one rnkRECORDSCACHE row for record 1.

    Existing rows for the record are deleted first, so a non-zero count
    afterwards comes from the store_record() call.
    """
    from invenio.bibrank_selfcites_indexer import store_record
    from invenio.bibrank_selfcites_indexer import get_authors_from_record
    from invenio.bibrank_selfcites_indexer import get_authors_tags
    from invenio.dbquery import run_sql

    tags = get_authors_tags()
    recid = 1
    authors = get_authors_from_record(recid, tags)

    sql = 'DELETE FROM rnkRECORDSCACHE WHERE id_bibrec = %s'
    run_sql(sql, (recid,))

    store_record(recid, authors)

    sql = 'SELECT count(*) FROM rnkRECORDSCACHE WHERE id_bibrec = %s'
    count = run_sql(sql, (recid,))[0][0]
    # assertTrue replaces the deprecated assert_ alias (same truthiness check).
    self.assertTrue(count)
def test_store_record(self):
    """
    store_record() leaves at least one rnkRECORDSCACHE row for record 1.

    Existing rows for the record are deleted first, so a non-zero count
    afterwards comes from the store_record() call.
    """
    from invenio.bibrank_selfcites_indexer import store_record
    from invenio.bibrank_selfcites_indexer import get_authors_from_record
    from invenio.bibrank_selfcites_indexer import get_authors_tags
    from invenio.dbquery import run_sql

    tags = get_authors_tags()
    recid = 1
    authors = get_authors_from_record(recid, tags)

    sql = 'DELETE FROM rnkRECORDSCACHE WHERE id_bibrec = %s'
    run_sql(sql, (recid, ))

    store_record(recid, authors)

    sql = 'SELECT count(*) FROM rnkRECORDSCACHE WHERE id_bibrec = %s'
    count = run_sql(sql, (recid, ))[0][0]
    # assertTrue replaces the deprecated assert_ alias (same truthiness check).
    self.assertTrue(count)
def test_store_record_coauthors_with_none_deleted(self):
    """
    store_record_coauthors() leaves at least one rnkEXTENDEDAUTHORS row for
    record 1.

    Existing rows with that id are deleted first. The call passes the
    record's authors twice and an empty list in between; presumably the
    empty list is the "deleted" argument the test name refers to (confirm
    against the store_record_coauthors signature).
    """
    from invenio.bibrank_selfcites_indexer import store_record_coauthors
    from invenio.bibrank_selfcites_indexer import get_authors_from_record
    from invenio.bibrank_selfcites_indexer import get_authors_tags
    from invenio.dbquery import run_sql

    tags = get_authors_tags()
    recid = 1
    config = {'friends_threshold': 3}
    authors = get_authors_from_record(recid, tags)

    sql = 'DELETE FROM rnkEXTENDEDAUTHORS WHERE id = %s'
    run_sql(sql, (recid, ))

    store_record_coauthors(recid, authors, [], authors, config)

    sql = 'SELECT count(*) FROM rnkEXTENDEDAUTHORS WHERE id = %s'
    count = run_sql(sql, (recid, ))[0][0]
    # assertTrue replaces the deprecated assert_ alias (same truthiness check).
    self.assertTrue(count)
def test_store_record_coauthors_with_none_deleted(self):
    """
    store_record_coauthors() leaves at least one rnkEXTENDEDAUTHORS row for
    record 1.

    Existing rows with that id are deleted first. The call passes the
    record's authors twice and an empty list in between; presumably the
    empty list is the "deleted" argument the test name refers to (confirm
    against the store_record_coauthors signature).
    """
    from invenio.bibrank_selfcites_indexer import store_record_coauthors
    from invenio.bibrank_selfcites_indexer import get_authors_from_record
    from invenio.bibrank_selfcites_indexer import get_authors_tags
    from invenio.dbquery import run_sql

    tags = get_authors_tags()
    recid = 1
    config = {'friends_threshold': 3}
    authors = get_authors_from_record(recid, tags)

    sql = 'DELETE FROM rnkEXTENDEDAUTHORS WHERE id = %s'
    run_sql(sql, (recid,))

    store_record_coauthors(recid, authors, [], authors, config)

    sql = 'SELECT count(*) FROM rnkEXTENDEDAUTHORS WHERE id = %s'
    count = run_sql(sql, (recid,))[0][0]
    # assertTrue replaces the deprecated assert_ alias (same truthiness check).
    self.assertTrue(count)
def fill_self_cites_tables(rank_method_code, config):
    """
    Populate the self-cites tables for every record id found in bibrec.

    Meant for an installation on which the self-cites daemon has never run.
    It is an optimization for empty tables; the result is hoped to match
    what compute_and_store_self_citations would produce.

    The per-record results are gathered in one dict, which is handed to
    intoDB() (with the start timestamp and rank_method_code) and to
    store_weights_cache() once all records are done.
    """
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    algorithm = config['algorithm']
    tags = get_authors_tags()
    selfcites_dic = {}
    all_ids = intbitset(run_sql('SELECT id FROM bibrec ORDER BY id'))
    total = len(all_ids)
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)

    def report(template, position):
        # Every 1000th record: publish the progress string, log it, then
        # call task_sleep_now_if_required().
        if position % 1000 != 0:
            return
        msg = template % (position, total)
        task_update_progress(msg)
        write_message(msg)
        task_sleep_now_if_required()

    # The intermediary tables are only filled for the 'friends' algorithm.
    if algorithm == 'friends':
        for position, recid in enumerate(all_ids):
            report('intermediate %d/%d', position)
            update_self_cites_tables(recid, config, tags)

    # Second pass: compute the self-citations of each record into the dict.
    for position, recid in enumerate(all_ids):
        report('final %d/%d', position)
        compute_and_store_self_citations(recid, tags, citations_fun,
                                         selfcites_dic)

    intoDB(selfcites_dic, begin_date, rank_method_code)
    store_weights_cache(selfcites_dic)
def process_updates(rank_method_code):
    """
    Entry point executed first when the task is started.

    When the task option "quick" is "no", the work is delegated to
    rebuild_tables() and its result is returned. Otherwise each concerned
    record goes through process_one(), after which the accumulated weights
    are passed to intoDB() and store_weights_cache(), and True is returned.
    """
    write_message("Running rank method: %s" % rank_method_code, verbose=0)

    parsed = read_configuration(rank_method_code)
    config = {}
    config['algorithm'] = parsed.get(rank_method_code, "algorithm")
    config['friends_threshold'] = parsed.get(rank_method_code,
                                             "friends_threshold")

    # Full rebuild requested: hand over and stop here.
    if task_get_option("quick") == "no":
        return rebuild_tables(rank_method_code, config)

    tags = get_authors_tags()
    recids, end_date = fetch_concerned_records(rank_method_code,
                                               task_get_option("id"))
    citations_fun = get_citations_fun(config['algorithm'])
    weights = fromDB(rank_method_code)

    write_message("recids %s" % str(recids))

    total = len(recids)
    for position, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        progress = "Extracting for %s (%d/%d)" % (recid, position + 1, total)
        task_update_progress(progress)
        write_message(progress)
        process_one(recid, tags, citations_fun, weights)

    intoDB(weights, end_date, rank_method_code)
    store_weights_cache(weights)

    write_message("Complete")
    return True
def process_updates(rank_method_code):
    """
    Entry point executed first when the task is started.

    When the task option "quick" is "no", the work is delegated to
    rebuild_tables() and its result is returned. Otherwise each concerned
    record goes through process_one(), the timestamp taken at the start is
    passed to store_last_updated(), and True is returned.
    """
    parsed = read_configuration(rank_method_code)
    config = {}
    config['algorithm'] = parsed.get(rank_method_code, "algorithm")
    config['friends_threshold'] = parsed.get(rank_method_code,
                                             "friends_threshold")

    # Taken before any record is processed; stored as the last-updated date.
    begin_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Full rebuild requested: hand over and stop here.
    if task_get_option("quick") == "no":
        return rebuild_tables(config)

    write_message("Starting")

    tags = get_authors_tags()
    recids = fetch_concerned_records(rank_method_code)
    citations_fun = get_citations_fun(config['algorithm'])

    write_message("recids %s" % str(recids))

    total = len(recids)
    for position, recid in enumerate(recids):
        task_sleep_now_if_required(can_stop_too=True)
        progress = "Extracting for %s (%d/%d)" % (recid, position + 1, total)
        task_update_progress(progress)
        write_message(progress)
        process_one(recid, tags, citations_fun)

    store_last_updated(rank_method_code, begin_date)

    write_message("Complete")
    return True
def test_update_self_cites_tables(self):
    """
    Call update_self_cites_tables() on record 1 with an empty config dict.

    There is no assertion; the test passes when the call does not raise.
    """
    from invenio.bibrank_selfcites_indexer import (
        update_self_cites_tables,
        get_authors_tags,
    )

    author_tags = get_authors_tags()
    empty_config = {}
    update_self_cites_tables(1, empty_config, author_tags)
def test_get_collaborations_from_record(self):
    """get_collaborations_from_record() on record 1 returns a falsy value."""
    from invenio.bibrank_selfcites_indexer import get_collaborations_from_record
    from invenio.bibrank_selfcites_indexer import get_authors_tags

    tags = get_authors_tags()
    # assertFalse replaces the deprecated assert_(not ...) spelling.
    self.assertFalse(get_collaborations_from_record(1, tags))