def test_rabbit_matchable_name(self, mocked_func1, mocked_func2):
    '''
    Both mocks are made to return the same fake name (per the original
    author's note they stand in for the function that creates the
    matchable name). A second paper with a heavily modified author name
    is then inserted; it must end up attached to the same person as the
    main paper.
    '''
    def person_of(bibrec):
        # personid found in the first row returned for the given record.
        rows = run_sql(
            "select personid from aidPERSONIDPAPERS where bibrec=%s",
            (bibrec, ))
        return rows[0][0]

    mocked_func1.return_value = 'Fake Name'
    mocked_func2.return_value = mocked_func1.return_value

    rabbit([self.main_bibrec], verbose=True)
    pid_of_main_paper = person_of(self.main_bibrec)

    extra_marcxml = get_new_marc_for_test(
        'Rabbit Test Paper', author_name=self.heavily_modified_name)
    extra_bibrec = get_bibrec_for_record(extra_marcxml, opt_mode='insert')
    # Registered so that the record gets cleaned up with the others.
    self.bibrecs_to_clean.append(extra_bibrec)
    rabbit([extra_bibrec], verbose=True)
    pid_of_extra_paper = person_of(extra_bibrec)

    self.assertEquals(pid_of_main_paper, pid_of_extra_paper)
def test_rabbit_add_inspireID():
    '''
    An inspire id is artificially added to an author. Then the record is
    uploaded again with a heavily modified name of the person plus the
    same inspire id. Although the name is totally different, the paper
    must stay with the same person because the inspire id matches.

    The external id is removed again in a ``finally`` clause, so a failing
    assertion does not leave it attached to the author.
    '''
    self.main_marcxml_record = get_modified_marc_for_test(
        self.main_marcxml_record, author_name=self.author_name)
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='replace')
    rabbit([self.main_bibrec], verbose=True)

    personid_to_test = get_authors_by_name(self.author_name)[0]
    # TODO: PERSONID_EXTERNAL_IDENTIFIER_MAP.values()
    add_external_id_to_author(personid_to_test, 'INSPIREID', self.ext_id)
    try:
        self.main_marcxml_record = get_modified_marc_for_test(
            self.main_marcxml_record,
            author_name=self.heavily_modified_name,
            ext_id=self.ext_id)
        self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                                 opt_mode='replace')
        rabbit([self.main_bibrec], verbose=True)

        self.assertEquals(
            personid_to_test,
            get_authors_by_name(self.heavily_modified_name)[0])
    finally:
        # Always undo the artificial id, even when the check above fails.
        _remove_external_id_from_author(personid_to_test, 'INSPIREID',
                                        self.ext_id)
def test_rabbit_heavily_modify_author():
    '''
    The author name of the record is replaced by the heavily modified
    name and rabbit is run on it.

    Expected afterwards: the bibref value changes, the heavily modified
    name is present in aidPERSONIDPAPERS and aidPERSONIDDATA, the original
    and the slightly modified names are gone, and the number of person
    ids is unchanged.
    '''
    pid_count_before = get_count_of_pids()

    self.main_marcxml_record = get_modified_marc_for_test(
        self.main_marcxml_record, author_name=self.heavily_modified_name)
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='replace')
    rabbit([self.main_bibrec], verbose=True)

    # Remember the old bibref, then store the new one for later tests.
    bibref_before = self.current_bibref_value
    self.current_bibref_value = get_bibref_value_for_name(
        self.heavily_modified_name)
    pid_count_after = get_count_of_pids()

    self.assertNotEquals(bibref_before, self.current_bibref_value)
    self.assertTrue(
        person_in_aidpersonidpapers(self.heavily_modified_name,
                                    self.main_bibrec))
    self.assertTrue(person_in_aidpersoniddata(self.heavily_modified_name))
    self.assertFalse(person_in_aidpersoniddata(self.author_name))
    self.assertFalse(
        person_in_aidpersonidpapers(self.slightly_modified_author_name,
                                    self.main_bibrec))
    self.assertEquals(pid_count_before, pid_count_after)
def test_rabbit_heavily_modify_coauthors():
    '''
    The co-author names of the record are replaced by their heavily
    modified variants while the main author stays the same.

    Expected afterwards: the author's bibref is untouched, the set of
    co-author bibrefs differs from the previous one, the number of person
    ids is unchanged and every heavily modified co-author is present in
    aidPERSONIDPAPERS and aidPERSONIDDATA.
    '''
    pid_count_before = get_count_of_pids()

    self.main_marcxml_record = get_modified_marc_for_test(
        self.main_marcxml_record,
        author_name=self.author_name,
        co_authors_names=self.heavily_mod_co_authors_names)
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='replace')
    rabbit([self.main_bibrec], verbose=True)

    author_bibref_before = self.current_bibref_value_of_author
    coauthor_bibrefs_before = deepcopy(
        self.current_bibref_values_of_coauthors)
    # Refresh the stored co-author bibrefs in place, slot by slot.
    for position, _ in enumerate(self.current_bibref_values_of_coauthors):
        modified_name = self.heavily_mod_co_authors_names[position]
        self.current_bibref_values_of_coauthors[position] = \
            get_bibref_value_for_name(modified_name)
    pid_count_after = get_count_of_pids()

    self.assertTrue(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertEquals(author_bibref_before,
                      self.current_bibref_value_of_author)
    self.assertNotEquals(set(coauthor_bibrefs_before),
                         set(self.current_bibref_values_of_coauthors))
    self.assertEquals(pid_count_after, pid_count_before)
    self.assertTrue(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertTrue(person_in_aidpersoniddata(self.author_name))

    for modified_coauthor in self.heavily_mod_co_authors_names:
        self.assertTrue(
            person_in_aidpersonidpapers(modified_coauthor,
                                        self.main_bibrec))
        self.assertTrue(person_in_aidpersoniddata(modified_coauthor))
def setUp(self):
    '''
    Reserve a free author id, attach a fixed ORCID to it, insert a test
    paper carrying the same ORCID as identifier, then run rabbit on the
    new record and populate the partial MARC caches for it.
    '''
    self._pid = get_free_author_id()
    self._orcid = '1234-1234-1234-1234'
    add_orcid_id_to_author(self._pid, self._orcid)

    orcid_identifier = 'ORCID:' + self._orcid
    paper_marcxml = get_new_marc_for_test('Orcid test paper',
                                          author_name='Author, SomeAuthor',
                                          identifiers=[orcid_identifier])
    self._rec = get_bibrec_for_record(paper_marcxml, opt_mode="insert")

    rabbit([self._rec])
    populate_partial_marc_caches([self._rec])
def rabbit_with_log(papers, check_invalid_papers, log_comment, partial=False):
    '''
    Run rabbit on the given papers and record the run in the user log.

    @param papers: papers handed over to rabbit
    @param check_invalid_papers: forwarded to rabbit as second argument
    @param log_comment: comment stored with the log entry
    @param partial: when true the entry is logged as 'PID_UPDATE_PARTIAL',
        otherwise as 'PID_UPDATE'
    '''
    from invenio.bibauthorid_rabbit import rabbit

    extids_personids = _get_personids_to_update_extids(papers)
    # The timestamp is taken before rabbit starts working.
    started_at = get_sql_time()
    rabbit(papers, check_invalid_papers, extids_personids)

    log_action = 'PID_UPDATE_PARTIAL' if partial else 'PID_UPDATE'
    insert_user_log('daemon', '-1', log_action, 'bibsched', 'status',
                    comment=log_comment, timestamp=started_at)
def test_rabbit_add_new_paper_with_one_author():
    '''
    Rabbit is run on the main record. Afterwards the author's name must
    be found both in aidPERSONIDPAPERS (for this record) and in
    aidPERSONIDDATA.
    '''
    rabbit([self.main_bibrec], verbose=True)

    # saved for following tests
    self.current_bibref_value = get_bibref_value_for_name(self.author_name)

    author_has_paper = person_in_aidpersonidpapers(self.author_name,
                                                   self.main_bibrec)
    self.assertTrue(author_has_paper)
    author_has_data = person_in_aidpersoniddata(self.author_name)
    self.assertTrue(author_has_data)
def test_rabbit_mark_record_as_deleted():
    '''
    The record (main author plus heavily modified co-authors) is uploaded
    in 'delete' mode and rabbit is run on it. The number of person ids
    must drop by five.
    '''
    expected_pid_count = get_count_of_pids() - 5

    self.main_marcxml_record = get_modified_marc_for_test(
        self.main_marcxml_record,
        author_name=self.author_name,
        co_authors_names=self.heavily_mod_co_authors_names)
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='delete')
    rabbit([self.main_bibrec], verbose=True)

    pid_count_after = get_count_of_pids()
    self.assertEquals(expected_pid_count, pid_count_after)
def safe_disambiguation_iteration():
    '''
    Run tortoise once, guarded by consistency checks.

    If check_author_paper_associations() fails, a repair is attempted
    (rabbit, repair_author_paper_associations, rabbit). Tortoise only runs
    when the check passes afterwards.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @raise AssertionError: if the associations are still inconsistent
        after the repair attempt, or if
        duplicated_tortoise_results_exist() returns a false value after
        tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_author_paper_associations,
        duplicated_tortoise_results_exist,
        repair_author_paper_associations,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_author_paper_associations():
        rabbit([])
        repair_author_paper_associations()
        rabbit([])

    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after repair')
    tortoise()
    ensure(duplicated_tortoise_results_exist(),
           'duplicated_tortoise_results_exist() failed after tortoise')
def safe_disambiguation_iteration():
    '''
    Run tortoise once, guarded by consistency checks.

    If check_author_paper_associations() fails, a repair is attempted
    (rabbit, repair_author_paper_associations, rabbit). Tortoise only runs
    when the check passes afterwards.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @raise AssertionError: if the associations are still inconsistent
        after the repair attempt, or if
        duplicated_tortoise_results_exist() returns a false value after
        tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_author_paper_associations,
        duplicated_tortoise_results_exist,
        repair_author_paper_associations,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_author_paper_associations():
        rabbit([])
        repair_author_paper_associations()
        rabbit([])

    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after repair')
    tortoise()
    ensure(duplicated_tortoise_results_exist(),
           'duplicated_tortoise_results_exist() failed after tortoise')
def safe_disambiguation_iteration():
    '''
    Run tortoise once, guarded by consistency checks.

    If check_personid_papers() fails, a repair is attempted (rabbit,
    repair_personid, rabbit). Tortoise only runs when the check passes
    afterwards.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @raise AssertionError: if check_personid_papers() still fails after
        the repair attempt, or if check_results() returns a false value
        after tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_personid_papers,
        check_results,
        repair_personid,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_personid_papers():
        rabbit([])
        repair_personid()
        rabbit([])

    ensure(check_personid_papers(),
           'check_personid_papers() failed after repair')
    tortoise()
    ensure(check_results(), 'check_results() failed after tortoise')
def safe_disambiguation_iteration():
    '''
    Run tortoise once, guarded by consistency checks.

    If check_personid_papers() fails, a repair is attempted (rabbit,
    repair_personid, rabbit). Tortoise only runs when the check passes
    afterwards.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @raise AssertionError: if check_personid_papers() still fails after
        the repair attempt, or if check_results() returns a false value
        after tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_personid_papers,
        check_results,
        repair_personid,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_personid_papers():
        rabbit([])
        repair_personid()
        rabbit([])

    ensure(check_personid_papers(),
           'check_personid_papers() failed after repair')
    tortoise()
    ensure(check_results(), 'check_results() failed after tortoise')
def test_rabbit_mark_record_as_deleted():
    '''
    A record is deleted. Rabbit should understand that and remove the
    author from the aidPERSON* tables.

    The check performed here is that the value returned by
    get_count_of_pids() drops by exactly one.
    '''
    number_of_personids_before = get_count_of_pids()
    # NOTE(review): presumably only the MARC rewrite below is conditional;
    # the final assertion expects a deletion regardless of the site
    # configuration. Confirm against the original layout.
    if config.CFG_INSPIRE_SITE:
        self.main_marcxml_record = get_modified_marc_for_test(
            self.main_marcxml_record,
            author_name=self.heavily_modified_name,
            ext_id=self.ext_id)
    # The record is uploaded in 'delete' mode before rabbit runs on it.
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='delete')
    rabbit([self.main_bibrec], verbose=True)
    number_of_personids_after = get_count_of_pids()
    self.assertEquals(number_of_personids_before - 1,
                      number_of_personids_after)
def test_m_names_cache(self, mocked_func, mocked_destroy):
    '''
    Check that, after rabbit ran on the main record, the cache
    invenio.bibauthorid_rabbit.M_NAME_PIDS_CACHE holds a true value for
    the matchable name of the author.

    mocked_func is set to return [9999]; mocked_destroy gets a no-op
    side effect (per the original note it replaces
    destroy_mnames_pids_cache so that the cache can be inspected).
    '''
    def do_nothing(*args, **kwargs):
        '''
        A function that does nothing, whatever arguments it receives.
        '''
        pass

    mocked_func.return_value = [9999]
    # Pass the function object itself: `do_nothing()` would evaluate to
    # None and leave side_effect unset.
    mocked_destroy.side_effect = do_nothing
    rabbit([self.main_bibrec], verbose=True)
    m_name = create_matchable_name(self.author_name)
    self.assertTrue(invenio.bibauthorid_rabbit.M_NAME_PIDS_CACHE[m_name])
def rabbit_with_log(papers, check_invalid_papers, log_comment, partial=False):
    '''
    Run rabbit on the given papers and record the run in the user log.

    @param papers: papers handed over to rabbit
    @param check_invalid_papers: forwarded to rabbit as second argument
    @param log_comment: comment stored with the log entry
    @param partial: when true the entry is logged as 'PID_UPDATE_PARTIAL',
        otherwise as 'PID_UPDATE'
    '''
    from invenio.bibauthorid_rabbit import rabbit

    extids_personids = _get_personids_to_update_extids(papers)
    # The timestamp is taken before rabbit starts working.
    started_at = get_db_time()
    rabbit(papers, check_invalid_papers, extids_personids)

    log_action = 'PID_UPDATE_PARTIAL' if partial else 'PID_UPDATE'
    insert_user_log('daemon', '-1', log_action, 'bibsched', 'status',
                    comment=log_comment, timestamp=started_at)
def test_accuracy():
    '''
    Run a pure tortoise pass, guarded by consistency checks, and return
    whatever matched_claims() reports.

    If check_personid_papers() fails, a repair is attempted (rabbit,
    repair_personid, rabbit) before tortoise(pure=True) is started.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @return: the result of matched_claims()
    @raise AssertionError: if check_personid_papers() still fails after
        the repair attempt, or if check_results() returns a false value
        after tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_personid_papers,
        check_results,
        repair_personid,
    )
    from invenio.bibauthorid_merge import matched_claims

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_personid_papers():
        rabbit([])
        repair_personid()
        rabbit([])

    ensure(check_personid_papers(),
           'check_personid_papers() failed after repair')
    tortoise(pure=True)
    ensure(check_results(), 'check_results() failed after tortoise')
    return matched_claims()
def test_accuracy():
    '''
    Run a pure tortoise pass, guarded by consistency checks, and return
    whatever matched_claims() reports.

    If check_author_paper_associations() fails, a repair is attempted
    (rabbit, repair_author_paper_associations, rabbit) before
    tortoise(pure=True) is started.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @return: the result of matched_claims()
    @raise AssertionError: if the associations are still inconsistent
        after the repair attempt, or if
        duplicated_tortoise_results_exist() returns a false value after
        tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_author_paper_associations,
        duplicated_tortoise_results_exist,
        repair_author_paper_associations,
    )
    from invenio.bibauthorid_merge import matched_claims

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_author_paper_associations():
        rabbit([])
        repair_author_paper_associations()
        rabbit([])

    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after repair')
    tortoise(pure=True)
    ensure(duplicated_tortoise_results_exist(),
           'duplicated_tortoise_results_exist() failed after tortoise')
    return matched_claims()
def test_accuracy():
    '''
    Run a pure tortoise pass, guarded by consistency checks, and return
    whatever matched_claims() reports.

    If check_author_paper_associations() fails, a repair is attempted
    (rabbit, repair_author_paper_associations, rabbit) before
    tortoise(pure=True) is started.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @return: the result of matched_claims()
    @raise AssertionError: if the associations are still inconsistent
        after the repair attempt, or if
        duplicated_tortoise_results_exist() returns a false value after
        tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_author_paper_associations,
        duplicated_tortoise_results_exist,
        repair_author_paper_associations,
    )
    from invenio.bibauthorid_merge import matched_claims

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_author_paper_associations():
        rabbit([])
        repair_author_paper_associations()
        rabbit([])

    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after repair')
    tortoise(pure=True)
    ensure(duplicated_tortoise_results_exist(),
           'duplicated_tortoise_results_exist() failed after tortoise')
    return matched_claims()
def test_accuracy():
    '''
    Run a pure tortoise pass, guarded by consistency checks, and return
    whatever matched_claims() reports.

    If check_personid_papers() fails, a repair is attempted (rabbit,
    repair_personid, rabbit) before tortoise(pure=True) is started.

    The guards are explicit raises instead of ``assert`` statements, so
    they are still enforced when Python runs with -O.

    @return: the result of matched_claims()
    @raise AssertionError: if check_personid_papers() still fails after
        the repair attempt, or if check_results() returns a false value
        after tortoise ran.
    '''
    from invenio.bibauthorid_tortoise import tortoise
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_personid_papers,
        check_results,
        repair_personid,
    )
    from invenio.bibauthorid_merge import matched_claims

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    if not check_personid_papers():
        rabbit([])
        repair_personid()
        rabbit([])

    ensure(check_personid_papers(),
           'check_personid_papers() failed after repair')
    tortoise(pure=True)
    ensure(check_results(), 'check_results() failed after tortoise')
    return matched_claims()
def test_rabbit_remove_author_from_paper():
    '''
    The record is re-uploaded without passing an author name and rabbit
    is run on it. The number of person ids must drop by one and the
    author must be gone from aidPERSONIDPAPERS and aidPERSONIDDATA.
    '''
    pid_count_before = get_count_of_pids()

    self.main_marcxml_record = get_modified_marc_for_test(
        self.main_marcxml_record)
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='replace')
    rabbit([self.main_bibrec], verbose=True)

    pid_count_after = get_count_of_pids()
    self.assertEquals(pid_count_before, pid_count_after + 1)

    author_has_paper = person_in_aidpersonidpapers(self.author_name,
                                                   self.main_bibrec)
    self.assertFalse(author_has_paper)
    author_has_data = person_in_aidpersoniddata(self.author_name)
    self.assertFalse(author_has_data)
def test_rabbit_add_new_paper_with_four_coauthors():
    '''
    Rabbit is run on the main record. The author and every co-author
    must afterwards be present in aidPERSONIDPAPERS (for this record) and
    in aidPERSONIDDATA. The bibref values of the author and of the
    co-authors are stored on the test instance.
    '''
    rabbit([self.main_bibrec], verbose=True)

    self.current_bibref_value_of_author = get_bibref_value_for_name(
        self.author_name)
    self.assertTrue(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertTrue(person_in_aidpersoniddata(self.author_name))

    # One entry for the author followed by one None per co-author.
    author_identifier = (self.ext_id, 'i')
    self.current_bdentifiers = (
        [author_identifier] + [None] * len(self.co_authors_names))

    self.current_bibref_values_of_coauthors = []
    for name in self.co_authors_names:
        coauthor_bibref = get_bibref_value_for_name(name)
        self.current_bibref_values_of_coauthors.append(coauthor_bibref)
        self.assertTrue(
            person_in_aidpersonidpapers(name, self.main_bibrec))
        self.assertTrue(person_in_aidpersoniddata(name))
def test_rabbit_add_author_again():
    '''
    The author name is put back on the record and rabbit is run on it.
    The bibref value must be the same as before, the number of person
    ids must grow by one and the author must be present again in
    aidPERSONIDPAPERS and aidPERSONIDDATA.
    '''
    pid_count_before = get_count_of_pids()

    self.main_marcxml_record = get_modified_marc_for_test(
        self.main_marcxml_record, author_name=self.author_name)
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='replace')
    rabbit([self.main_bibrec], verbose=True)

    # Remember the old bibref, then store the fresh one for later tests.
    bibref_before = self.current_bibref_value
    self.current_bibref_value = get_bibref_value_for_name(self.author_name)
    pid_count_after = get_count_of_pids()

    self.assertEquals(bibref_before, self.current_bibref_value)
    self.assertEquals(pid_count_after, pid_count_before + 1)
    self.assertTrue(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertTrue(person_in_aidpersoniddata(self.author_name))
def safe_merger():
    '''
    Run merge_static(), guarded by consistency checks, and write the
    person id comparison to "/tmp/merge_diff".

    Steps: duplicated_tortoise_results_exist() must pass; if
    check_author_paper_associations() fails a repair is attempted
    (rabbit, repair_author_paper_associations, rabbit); the associations
    are backed up; merge_static() runs; the associations and
    merger_errors_exist() are checked; compare_personids() is called.

    The guards are explicit raises instead of ``assert`` statements, so
    the merge is never started unchecked when Python runs with -O.

    @raise AssertionError: as soon as one of the checks returns a false
        value.
    '''
    from invenio.bibauthorid_merge import merge_static
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_author_paper_associations,
        duplicated_tortoise_results_exist,
        merger_errors_exist,
        repair_author_paper_associations,
        back_up_author_paper_associations,
        compare_personids,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    ensure(duplicated_tortoise_results_exist(),
           'duplicated_tortoise_results_exist() failed before merge')

    if not check_author_paper_associations():
        rabbit([])
        repair_author_paper_associations()
        rabbit([])

    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after repair')
    back_up_author_paper_associations()
    merge_static()
    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after merge')
    ensure(merger_errors_exist(), 'merger_errors_exist() failed after merge')
    compare_personids("/tmp/merge_diff")
def safe_merger():
    '''
    Run merge_static(), guarded by consistency checks, and write the
    person id comparison to "/tmp/merge_diff".

    Steps: check_results() must pass; if check_personid_papers() fails a
    repair is attempted (rabbit, repair_personid, rabbit);
    copy_personids() is called; merge_static() runs;
    check_personid_papers() and check_merger() are verified;
    compare_personids() is called.

    The guards are explicit raises instead of ``assert`` statements, so
    the merge is never started unchecked when Python runs with -O.

    @raise AssertionError: as soon as one of the checks returns a false
        value.
    '''
    from invenio.bibauthorid_merge import merge_static
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_personid_papers,
        check_results,
        check_merger,
        repair_personid,
        copy_personids,
        compare_personids,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    ensure(check_results(), 'check_results() failed before merge')

    if not check_personid_papers():
        rabbit([])
        repair_personid()
        rabbit([])

    ensure(check_personid_papers(),
           'check_personid_papers() failed after repair')
    copy_personids()
    merge_static()
    ensure(check_personid_papers(),
           'check_personid_papers() failed after merge')
    ensure(check_merger(), 'check_merger() failed after merge')
    compare_personids("/tmp/merge_diff")
def test_rabbit_remove_coauthors_from_paper():
    '''
    The record is re-uploaded with the main author only and rabbit is
    run on it. The number of person ids must drop by four, the author
    must still be present and none of the co-authors may remain in
    aidPERSONIDPAPERS or aidPERSONIDDATA.
    '''
    pid_count_before = get_count_of_pids()

    self.main_marcxml_record = get_modified_marc_for_test(
        self.main_marcxml_record, author_name=self.author_name)
    self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                             opt_mode='replace')
    rabbit([self.main_bibrec], verbose=True)

    pid_count_after = get_count_of_pids()
    self.assertEquals(pid_count_before, pid_count_after + 4)

    self.assertTrue(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertTrue(person_in_aidpersoniddata(self.author_name))

    for removed_coauthor in self.co_authors_names:
        self.assertFalse(
            person_in_aidpersonidpapers(removed_coauthor,
                                        self.main_bibrec))
        self.assertFalse(person_in_aidpersoniddata(removed_coauthor))
def safe_merger():
    '''
    Run merge_static(), guarded by consistency checks, and write the
    person id comparison to "/tmp/merge_diff".

    Steps: duplicated_tortoise_results_exist() must pass; if
    check_author_paper_associations() fails a repair is attempted
    (rabbit, repair_author_paper_associations, rabbit); the associations
    are backed up; merge_static() runs; the associations and
    merger_errors_exist() are checked; compare_personids() is called.

    The guards are explicit raises instead of ``assert`` statements, so
    the merge is never started unchecked when Python runs with -O.

    @raise AssertionError: as soon as one of the checks returns a false
        value.
    '''
    from invenio.bibauthorid_merge import merge_static
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_author_paper_associations,
        duplicated_tortoise_results_exist,
        merger_errors_exist,
        repair_author_paper_associations,
        back_up_author_paper_associations,
        compare_personids,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    ensure(duplicated_tortoise_results_exist(),
           'duplicated_tortoise_results_exist() failed before merge')

    if not check_author_paper_associations():
        rabbit([])
        repair_author_paper_associations()
        rabbit([])

    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after repair')
    back_up_author_paper_associations()
    merge_static()
    ensure(check_author_paper_associations(),
           'check_author_paper_associations() failed after merge')
    ensure(merger_errors_exist(), 'merger_errors_exist() failed after merge')
    compare_personids("/tmp/merge_diff")
def safe_merger():
    '''
    Run merge_static(), guarded by consistency checks, and write the
    person id comparison to "/tmp/merge_diff".

    Steps: check_results() must pass; if check_personid_papers() fails a
    repair is attempted (rabbit, repair_personid, rabbit);
    copy_personids() is called; merge_static() runs;
    check_personid_papers() and check_merger() are verified;
    compare_personids() is called.

    The guards are explicit raises instead of ``assert`` statements, so
    the merge is never started unchecked when Python runs with -O.

    @raise AssertionError: as soon as one of the checks returns a false
        value.
    '''
    from invenio.bibauthorid_merge import merge_static
    from invenio.bibauthorid_rabbit import rabbit
    from invenio.bibauthorid_personid_maintenance import (
        check_personid_papers,
        check_results,
        check_merger,
        repair_personid,
        copy_personids,
        compare_personids,
    )

    def ensure(passed, failure_message):
        # Same exception type as a failing assert, but not removed by -O.
        if not passed:
            raise AssertionError(failure_message)

    ensure(check_results(), 'check_results() failed before merge')

    if not check_personid_papers():
        rabbit([])
        repair_personid()
        rabbit([])

    ensure(check_personid_papers(),
           'check_personid_papers() failed after repair')
    copy_personids()
    merge_static()
    ensure(check_personid_papers(),
           'check_personid_papers() failed after merge')
    ensure(check_merger(), 'check_merger() failed after merge')
    compare_personids("/tmp/merge_diff")
def test_m_names_transformations(self):
    '''
    Two recording wrappers around the first entry of
    invenio.bibauthorid_rabbit.M_NAME_FUNCTIONS are installed as the
    remaining entries of that list. After a rabbit run both wrappers
    must have been called.

    The original content of M_NAME_FUNCTIONS is put back afterwards, even
    if rabbit raises, so that the patched list does not leak into other
    tests.
    '''
    def m_name_func_1(name):
        m_name_func_1.has_been_called = True
        return invenio.bibauthorid_rabbit.M_NAME_FUNCTIONS[0](name)
    m_name_func_1.has_been_called = False

    def m_name_func_2(name):
        m_name_func_2.has_been_called = True
        return invenio.bibauthorid_rabbit.M_NAME_FUNCTIONS[0](name)
    m_name_func_2.has_been_called = False

    saved_functions = list(invenio.bibauthorid_rabbit.M_NAME_FUNCTIONS)
    invenio.bibauthorid_rabbit.M_NAME_FUNCTIONS[1:] = [
        m_name_func_1, m_name_func_2
    ]
    try:
        rabbit([self.main_bibrec], verbose=True)
    finally:
        # Slice assignment restores the content of the very same list
        # object the module keeps referring to.
        invenio.bibauthorid_rabbit.M_NAME_FUNCTIONS[:] = saved_functions

    self.assertTrue(m_name_func_1.has_been_called)
    self.assertTrue(m_name_func_2.has_been_called)
def test_rabbit_claim_record():
    '''
    The test record is artificially claimed and the author name is then
    modified in two steps:
        i) slightly
            The number of person ids stays the same, the slightly
            modified name replaces the original one in aidPERSONIDPAPERS
            and the paper is still reported as claimed.
        ii) heavily
            The heavily modified name replaces the slightly modified one
            in aidPERSONIDPAPERS and the paper is no longer reported as
            claimed.
    '''
    def upload_with_author(author_name):
        # Rewrite the author of the record, replace it, re-run rabbit.
        self.main_marcxml_record = get_modified_marc_for_test(
            self.main_marcxml_record, author_name=author_name)
        self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                                 opt_mode='replace')
        rabbit([self.main_bibrec], verbose=True)

    pid_count_before = get_count_of_pids()
    upload_with_author(self.author_name)
    claim_test_paper(self.main_bibrec)

    # i) slight modification
    upload_with_author(self.slightly_modified_author_name)
    pid_count_after = get_count_of_pids()
    self.assertEquals(pid_count_before, pid_count_after)
    self.assertTrue(
        person_in_aidpersonidpapers(self.slightly_modified_author_name,
                                    self.main_bibrec))
    self.assertFalse(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertTrue(is_test_paper_claimed(self.main_bibrec, 100))

    # ii) heavy modification
    upload_with_author(self.heavily_modified_name)
    self.assertTrue(
        person_in_aidpersonidpapers(self.heavily_modified_name,
                                    self.main_bibrec))
    self.assertFalse(
        person_in_aidpersonidpapers(self.slightly_modified_author_name,
                                    self.main_bibrec))
    self.assertFalse(is_test_paper_claimed(self.main_bibrec, 100))
def test_rabbit_claim_record():
    '''
    The record is uploaded with its original co-authors and claimed.
    The co-author names are then modified in two steps:
        i) slightly
            The number of person ids stays the same, the paper is still
            reported as claimed, the slightly modified names are in
            aidPERSONIDPAPERS and the original names are still in
            aidPERSONIDDATA.
        ii) heavily
            The heavily modified names are in aidPERSONIDPAPERS and
            aidPERSONIDDATA and the paper is no longer reported as
            claimed.
    The main author must stay present throughout.
    '''
    def upload_with_coauthors(co_authors_names):
        # Rewrite the co-authors of the record, replace it, re-run rabbit.
        self.main_marcxml_record = get_modified_marc_for_test(
            self.main_marcxml_record,
            author_name=self.author_name,
            co_authors_names=co_authors_names)
        self.main_bibrec = get_bibrec_for_record(self.main_marcxml_record,
                                                 opt_mode='replace')
        rabbit([self.main_bibrec], verbose=True)

    upload_with_coauthors(self.co_authors_names)
    claim_test_paper(self.main_bibrec)
    pid_count_before = get_count_of_pids()

    # i) slight modification
    upload_with_coauthors(self.slightly_mod_co_authors_names)
    pid_count_after = get_count_of_pids()
    self.assertEquals(pid_count_before, pid_count_after)
    self.assertTrue(is_test_paper_claimed(self.main_bibrec, 700))
    self.assertTrue(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertTrue(person_in_aidpersoniddata(self.author_name))
    for modified_coauthor in self.slightly_mod_co_authors_names:
        self.assertTrue(
            person_in_aidpersonidpapers(modified_coauthor,
                                        self.main_bibrec))
    for original_coauthor in self.co_authors_names:
        self.assertTrue(person_in_aidpersoniddata(original_coauthor))

    # ii) heavy modification
    upload_with_coauthors(self.heavily_mod_co_authors_names)
    self.assertTrue(
        person_in_aidpersonidpapers(self.author_name, self.main_bibrec))
    self.assertTrue(person_in_aidpersoniddata(self.author_name))
    for modified_coauthor in self.heavily_mod_co_authors_names:
        self.assertTrue(
            person_in_aidpersonidpapers(modified_coauthor,
                                        self.main_bibrec))
        self.assertTrue(person_in_aidpersoniddata(modified_coauthor))
    self.assertFalse(is_test_paper_claimed(self.main_bibrec, 700))
float(i) / total_updates, '%s out of %s (%s)' % (str(i), str(total_updates), str(bibref))) try: name = get_name_by_bibref(bibref) except AssertionError, error: if "A bibref must have exactly one name" in error.message: records_for_rabbit.add(bibref[1]) else: raise error else: m_name = create_matchable_name(name) run_sql( "update aidPERSONIDPAPERS set name=%s, m_name=%s where bibref_table=%s " "and bibref_value=%s ", (name, m_name, bibref[0], bibref[1])) if records_for_rabbit: rabbit(records_for_rabbit) logger.update_status(1., 'Finished') run_sql( "alter table aidPERSONIDDATA modify data varchar(255) not null default '' " ) def estimate(): """ Let's assume 2ms per sql query in a standard production environment, with some safety margin. """ n = run_sql("select count(*) from aidPERSONIDPAPERS")[0][0] queries = n * 2 return 0.002 * queries
def setUpClass(cls):
    '''
    Build the shared fixture of the hoover regression tests.

    Nothing happens when cls.run_exec is already true. Otherwise the
    method creates nineteen fake authors, inserts the test papers
    (paper1 .. paper16 and paper18), runs rabbit on them, looks up the
    bibref and person id of every author, claims a subset of the papers
    and finally runs hoover on the collected person ids.

    While hoover runs, the two duplicate-paper exception classes of
    invenio.bibauthorid_hoover are replaced by recording subclasses; they
    are put back in a ``finally`` clause so that a failing hoover run does
    not leave the module patched.

    The recording subclasses write to the module-level names `dupl` and
    `pid`, which must exist in the enclosing module.
    '''
    if cls.run_exec:
        return
    cls.run_exec = True
    cls.verbose = 0
    cls.logger = setup_loggers()
    cls.logger.info('Setting up regression tests...')
    task_set_task_param('verbose', cls.verbose)

    # author1 .. author19, one letter of the alphabet each; author1 is
    # {'name': 'authoraaaaa authoraaaab', 'inspireID': 'INSPIRE-FAKE_ID1'}.
    cls.authors = {}
    for number, letter in enumerate('abcdefghijklmnopqrs', 1):
        cls.authors['author%d' % number] = {
            'name': 'author%sa author%sb' % (letter * 4, letter * 4),
            'inspireID': 'INSPIRE-FAKE_ID%d' % number,
        }
    cls.marc_xmls = dict()
    cls.bibrecs = dict()
    cls.pids = dict()
    cls.bibrefs = dict()

    def name_of(author_key):
        # Name of one of the fake authors.
        return cls.authors[author_key]['name']

    def inspire_ids(*author_keys):
        # Tuple of (inspire id, 'i') pairs taken from the given authors.
        return tuple((cls.authors[key]['inspireID'], 'i')
                     for key in author_keys)

    def store_paper(paper_key, marc_xml):
        # Insert the record and keep its MARCXML with the 001 field added.
        cls.marc_xmls[paper_key] = marc_xml
        cls.bibrecs[paper_key] = get_bibrec_for_record(
            cls.marc_xmls[paper_key], opt_mode='insert')
        cls.marc_xmls[paper_key] = add_001_field(cls.marc_xmls[paper_key],
                                                 cls.bibrecs[paper_key])

    def add_paper(paper_key, author_key, id_owner_keys, co_author_keys=()):
        # 'Test Paper' signed by author_key, carrying the inspire ids of
        # id_owner_keys and, optionally, some co-authors.
        co_authors = tuple(name_of(key) for key in co_author_keys) or None
        store_paper(paper_key,
                    get_new_marc_for_test('Test Paper',
                                          name_of(author_key),
                                          co_authors,
                                          inspire_ids(*id_owner_keys),
                                          limit_to_collections=True))

    # test_hoover_inertia: the only paper created without identifiers.
    store_paper('paper1',
                get_new_marc_for_test('Test Paper',
                                      name_of('author1'),
                                      limit_to_collections=True))
    # test_hoover_duplication
    add_paper('paper2', 'author2', ['author2'])
    # test_hoover_assign_one_inspire_id_from_an_unclaimed_paper
    add_paper('paper3', 'author3', ['author3'])
    # test_hoover_assign_one_inspire_id_from_a_claimed_paper
    add_paper('paper4', 'author4', ['author4'])
    # ..._from_unclaimed_papers_with_different_inspireID
    add_paper('paper5', 'author5', ['author5'])
    add_paper('paper6', 'author5', ['author6'])
    # ..._from_a_claimed_paper_and_unclaimed_paper_with_different_inspireID
    add_paper('paper7', 'author7', ['author7'])
    add_paper('paper8', 'author7', ['author8'])
    # ..._from_claimed_papers_with_different_inspireID
    # NOTE(review): paper9 carries the inspire id of author2, not of
    # author9 - kept as found, confirm that this is intended.
    add_paper('paper9', 'author9', ['author2'])
    add_paper('paper10', 'author9', ['author10'])
    # test_hoover_vacuum_an_unclaimed_paper_with_an_inspire_id_from_a_claimed_paper
    add_paper('paper11', 'author11', ['author11'])
    add_paper('paper12', 'author12', ['author11'])
    # test_hoover_vacuum_a_claimed_paper_with_an_inspire_id_from_a_claimed_paper
    add_paper('paper13', 'author13', ['author13'])
    add_paper('paper14', 'author14', ['author13'])
    # test_hoover_assign_one_inspire_id_from_hepnames_record
    store_paper('paper15',
                get_new_hepnames_marc_for_test(name_of('author15'),
                                               inspire_ids('author15')))
    # duplicated unclaimed signature: the same inspire id listed twice
    add_paper('paper16', 'author16', ['author16', 'author16'], ['author17'])
    # duplicated claimed signature (paper18 is claimed below)
    add_paper('paper18', 'author18', ['author18', 'author18'], ['author19'])

    cls.bibrecs_to_clean = [cls.bibrecs[key] for key in cls.bibrecs]
    rabbit(sorted(cls.bibrecs[key] for key in cls.bibrecs), verbose=False)

    for key in cls.authors:
        try:
            cls.bibrefs[key] = get_bibref_value_for_name(
                cls.authors[key]['name'])
            rows = run_sql(
                "select personid from aidPERSONIDPAPERS where bibref_value=%s and bibrec=%s and name=%s",
                (cls.bibrefs[key],
                 cls.bibrecs[key.replace('author', 'paper')],
                 cls.authors[key]['name']))
            cls.pids[key] = rows[0][0] if rows else ()
        except KeyError as e:
            # Authors without a paper of the same number (author17 and
            # author19) have no bibrec entry; report it and carry on.
            print(e)

    for claimed_paper in ('paper4', 'paper7', 'paper9', 'paper10',
                          'paper11', 'paper13', 'paper14', 'paper18'):
        claim_test_paper(cls.bibrecs[claimed_paper])

    tmp_claimed_exception = invenio.bibauthorid_hoover.DuplicateClaimedPaperException
    tmp_unclaimed_exception = invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException

    class MockClaimedException(
            invenio.bibauthorid_hoover.DuplicateClaimedPaperException):
        # Counts every instantiation in the module-level `dupl`.

        def __init__(self, message, pid, signature, present_signatures):
            global dupl
            super(MockClaimedException, self).__init__(
                message, pid, signature, present_signatures)
            dupl += 1

    class MockUnclaimedException(
            invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException):
        # Stores the person id it was raised for in the module-level `pid`.

        def __init__(self, message, _pid, signature, present_signatures):
            global pid
            super(MockUnclaimedException, self).__init__(
                message, _pid, signature, present_signatures)
            pid = _pid

    invenio.bibauthorid_hoover.DuplicateClaimedPaperException = MockClaimedException
    invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException = MockUnclaimedException
    try:
        hoover(list(set(cls.pids[key] for key in cls.pids if cls.pids[key])))
    finally:
        # Undo the patching even when hoover fails.
        invenio.bibauthorid_hoover.DuplicateClaimedPaperException = tmp_claimed_exception
        invenio.bibauthorid_hoover.DuplicateUnclaimedPaperException = tmp_unclaimed_exception
    # Same output as the Python 2 statement `print "dupl", dupl`.
    print("dupl %s" % (dupl, ))