def setUp(self):
    """
    Create two fixtures: an empty Bib_matrix and a second one
    initialised from a cluster set holding ten clusters of ten
    bib elements each (bibs 0..99).
    """
    self.bm = Bib_matrix('testname', storage_dir_override='/tmp/')
    cluster_set = ClusterSet()
    clusters = []
    for base in range(0, 100, 10):
        clusters.append(ClusterSet.Cluster(range(base, base + 10)))
    cluster_set.clusters = clusters
    cluster_set.update_bibs()
    self.css = cluster_set
    self.bmcs0 = Bib_matrix('testname2', self.css,
                            storage_dir_override='/tmp/')
def FIXME_1678_test_save_matrix(self):
    '''
    Matrix should save, be loadable, and stay equal to a newly
    loaded one on the same files
    '''
    self.bmcs0.store()
    reloaded = Bib_matrix('testname2', storage_dir_override='/tmp/')
    self.assertTrue(reloaded.load())
    original = self.bmcs0
    # Every entry of the reloaded matrix must match the stored one.
    for row in range(100):
        for col in range(100):
            self.assertTrue(original[row, col] == reloaded[row, col])
def recalculate(self, cluster_set):
    '''
    Construct the probability matrix for the given cluster set.

    The current matrix is stored, duplicated under '<name>copy' and used
    as a cache: entries for bibs that are still up to date are copied
    from it instead of being recomputed with compare_bibrefrecs. If the
    old matrix cannot be duplicated/loaded, everything is recomputed.

    @param cluster_set: A cluster set object, used to initialize the matrix.
    '''
    last_cleaned = 0
    self._bib_matrix.store()
    try:
        old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
        old_matrix.duplicate_existing(self._bib_matrix.name,
                                      self._bib_matrix.name + 'copy')
        old_matrix.load()
        cached_bibs = self.__get_up_to_date_bibs(old_matrix)
        have_cached_bibs = bool(cached_bibs)
    except IOError:
        # No usable previous matrix: fall back to full recomputation.
        # NOTE(review): if Bib_matrix() itself raised, old_matrix is
        # unbound here and destroy() would raise NameError -- confirm.
        old_matrix.destroy()
        cached_bibs = None
        have_cached_bibs = False

    self._bib_matrix.destroy()
    self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                  cluster_set=cluster_set)

    ncl = cluster_set.num_all_bibs
    # Number of unordered pairs; floor division keeps the result an
    # integer on both Python 2 and Python 3 (the original used '/').
    expected = (ncl * (ncl - 1)) // 2
    if expected == 0:
        expected = 1

    # Initialized up front so the error report in the except clause
    # below cannot fail with a NameError if an exception fires before
    # the first comparison is computed.
    val = None
    try:
        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:
            # Throttled progress output: at most once per ~100000 steps.
            if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                update_status(
                    (float(opti) + cur_calc) / expected,
                    "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc + opti

            # Periodic garbage collection to bound memory usage.
            if cur_calc - last_cleaned > 20000000:
                gc.collect()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                # id() ordering visits each unordered cluster pair once;
                # mutually exclusive ("hating") clusters are skipped.
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs:
                                try:
                                    val = old_matrix[bib1, bib2]
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(
                                            val,
                                            compare_bibrefrecs(bib1, bib2))
                                except KeyError:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                    if not val:
                                        # NOTE(review): recomputes when the
                                        # first comparison is falsy; looks
                                        # redundant -- confirm intent.
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            self._bib_matrix[bib1, bib2] = val
    except Exception as e:
        # 'except Exception, e' was Python-2-only syntax; 'as' is valid
        # on Python 2.6+ and Python 3.
        raise Exception("""Error happened in prob_matrix.recalculate
                        with val:%s
                        original_exception: %s
                        """ % (str(val), str(e)))
class TestBibMatrix(InvenioTestCase):
    """Unit tests for Bib_matrix: symmetry, uniqueness of the triangular
    indexing, persistence and special ('+'/'-'/None) values."""

    def setUp(self):
        """
        Set up an empty bibmatrix and one filled with ten clusters
        of 10 elements each.
        """
        self.bm = Bib_matrix('testname', storage_dir_override='/tmp/')
        self.css = ClusterSet()
        self.css.clusters = [ClusterSet.Cluster(range(i * 10, i * 10 + 10))
                             for i in range(10)]
        self.css.update_bibs()
        self.bmcs0 = Bib_matrix('testname2', self.css,
                                storage_dir_override='/tmp/')

    def tearDown(self):
        # Remove the on-disk files created by the fixtures.
        self.bm.destroy()
        self.bmcs0.destroy()

    def test_resolve_entry_simmetry(self):
        '''
        Bib matrix stores a triangular matrix. Entries should be symmetric.
        '''
        for j in range(100):
            for k in range(100):
                # assertEqual gives a useful message on failure,
                # unlike assertTrue(a == b).
                self.assertEqual(self.bmcs0._resolve_entry((j, k)),
                                 self.bmcs0._resolve_entry((k, j)))

    def test_resolve_entry_unicity(self):
        '''
        _resolve_entry should produce unique indexes for any couple of values
        '''
        ntests = 30
        testvalues = set((i, j) for i in range(ntests)
                         for j in range(ntests))
        for k in range(ntests):
            for z in range(ntests):
                # All pairs except (k, z) and its mirror must map to
                # different indexes than (k, z).
                tvalues = testvalues - set([(k, z)]) - set([(z, k)])
                val = self.bmcs0._resolve_entry((k, z))
                allvalues = set(self.bmcs0._resolve_entry(v)
                                for v in tvalues)
                self.assertFalse(val in allvalues,
                                 str(val) + ' is in, from ' + str((k, z)))

    def test_matrix_content(self):
        '''
        The matrix should be symmetric, and values should be preserved
        '''
        # Fill the lower triangle...
        for i in range(100):
            for j in range(i + 1):
                self.bmcs0[i, j] = (i, j)
        # ...and read it back through the upper triangle.
        for i in range(100):
            for j in range(i + 1, 100):
                val = self.bmcs0[i, j]
                if i < j:
                    k, z = j, i
                else:
                    k, z = i, j
                self.assertEqual(val[0], k)
                self.assertEqual(val[1], z)

    def test_create_empty_matrix(self):
        """ All elements should be None """
        for i in range(9, 10):
            for j in range(i * 10, i * 10 + 10):
                for k in range(i * 10, i * 10 + 10):
                    # Identity check -- '== None' was the original idiom.
                    self.assertTrue(self.bmcs0[(j, k)] is None)

    @nottest
    def FIXME_1678_test_save_matrix(self):
        '''
        Matrix should save, be loadable, and stay equal to a newly
        loaded one on the same files
        '''
        self.bmcs0.store()
        loaded = Bib_matrix('testname2', storage_dir_override='/tmp/')
        self.assertTrue(loaded.load())
        bmcs0 = self.bmcs0
        for i in range(100):
            for j in range(100):
                self.assertEqual(bmcs0[i, j], loaded[i, j])

    def test_duplicate_existing(self):
        # Duplicating the stored matrix must preserve every entry.
        self.bmcs0.store()
        self.bm.duplicate_existing('testname2', 'testnameduplicate')
        self.assertTrue(self.bmcs0.load())
        self.assertTrue(self.bm.load())
        bmcs0 = self.bmcs0
        bm = self.bm
        for i in range(100):
            for j in range(100):
                self.assertEqual(bmcs0[i, j], bm[i, j])

    def test_special_items(self):
        # '+', '-' and None are stored via special sentinel encodings.
        self.bmcs0[0, 0] = '+'
        self.bmcs0[0, 1] = '-'
        self.bmcs0[0, 2] = None
        self.assertEqual(self.bmcs0[0, 0], '+')
        self.assertEqual(self.bmcs0[0, 1], '-')
        self.assertTrue(self.bmcs0[0, 2] is None)

    def test_getitem_numeric(self):
        # Numeric view of the special values: '+' -> -2, '-' -> -1,
        # None -> -3.
        self.bmcs0[0, 0] = '+'
        self.bmcs0[0, 1] = '-'
        self.bmcs0[0, 2] = None
        self.assertEqual(self.bmcs0.getitem_numeric([0, 0])[0], -2)
        self.assertEqual(self.bmcs0.getitem_numeric([0, 1])[0], -1)
        self.assertEqual(self.bmcs0.getitem_numeric([0, 2])[0], -3)
def __init__(self, name):
    """Create the wrapper around a named Bib_matrix."""
    # All comparison values are delegated to this underlying matrix.
    self._bib_matrix = Bib_matrix(name)
class ProbabilityMatrix(object):
    '''
    This class contains and maintains the comparison between all
    virtual authors. It is able to write and read from the database
    and update the results.
    '''
    def __init__(self, name):
        self._bib_matrix = Bib_matrix(name)

    def load(self, load_map=True, load_matrix=True):
        # NOTE(review): load_map/load_matrix are accepted but not used
        # here; kept for interface compatibility with callers.
        update_status(0., "Loading probability matrix...")
        self._bib_matrix.load()
        update_status_final("Probability matrix loaded.")

    def store(self):
        update_status(0., "Saving probability matrix...")
        self._bib_matrix.store()
        update_status_final("Probability matrix saved.")

    def __getitem__(self, bibs):
        return self._bib_matrix[bibs[0], bibs[1]]

    def getitem_numeric(self, bibs):
        return self._bib_matrix.getitem_numeric(bibs)

    def __get_up_to_date_bibs(self, bib_matrix):
        # Keys of the matrix whose papers were not modified after the
        # matrix was created, i.e. entries that can be reused.
        return frozenset(get_modified_papers_before(
            bib_matrix.get_keys(), bib_matrix.creation_time))

    def is_up_to_date(self, cluster_set):
        return self.__get_up_to_date_bibs(self._bib_matrix) >= frozenset(
            cluster_set.all_bibs())

    def recalculate(self, cluster_set):
        '''
        Construct the probability matrix for the given cluster set.

        The current matrix is stored, duplicated under '<name>copy' and
        used as a cache: entries for bibs that are still up to date are
        copied from it instead of being recomputed with
        compare_bibrefrecs. If the old matrix cannot be
        duplicated/loaded, everything is recomputed.

        @param cluster_set: A cluster set object, used to initialize the
            matrix.
        '''
        last_cleaned = 0
        self._bib_matrix.store()
        try:
            old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
            old_matrix.duplicate_existing(self._bib_matrix.name,
                                          self._bib_matrix.name + 'copy')
            old_matrix.load()
            cached_bibs = self.__get_up_to_date_bibs(old_matrix)
            have_cached_bibs = bool(cached_bibs)
        except IOError:
            # No usable previous matrix: fall back to full recomputation.
            # NOTE(review): if Bib_matrix() itself raised, old_matrix is
            # unbound here and destroy() would raise NameError -- confirm.
            old_matrix.destroy()
            cached_bibs = None
            have_cached_bibs = False

        self._bib_matrix.destroy()
        self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                      cluster_set=cluster_set)

        ncl = cluster_set.num_all_bibs
        # Number of unordered pairs; floor division keeps the result an
        # integer on both Python 2 and Python 3 (the original used '/').
        expected = (ncl * (ncl - 1)) // 2
        if expected == 0:
            expected = 1

        # Initialized up front so the error report in the except clause
        # below cannot fail with a NameError if an exception fires
        # before the first comparison is computed.
        val = None
        try:
            cur_calc, opti, prints_counter = 0, 0, 0
            for cl1 in cluster_set.clusters:
                # Throttled progress output.
                if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                    update_status(
                        (float(opti) + cur_calc) / expected,
                        "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                    prints_counter = cur_calc + opti

                # Periodic garbage collection to bound memory usage.
                if cur_calc - last_cleaned > 20000000:
                    gc.collect()
                    last_cleaned = cur_calc

                for cl2 in cluster_set.clusters:
                    # id() ordering visits each unordered cluster pair
                    # once; mutually exclusive clusters are skipped.
                    if id(cl1) < id(cl2) and not cl1.hates(cl2):
                        for bib1 in cl1.bibs:
                            for bib2 in cl2.bibs:
                                if have_cached_bibs:
                                    try:
                                        val = old_matrix[bib1, bib2]
                                        opti += 1
                                        if bconfig.DEBUG_CHECKS:
                                            assert _debug_is_eq_v(
                                                val,
                                                compare_bibrefrecs(bib1,
                                                                   bib2))
                                    except KeyError:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                        if not val:
                                            # NOTE(review): recomputes
                                            # when the first comparison
                                            # is falsy; looks redundant
                                            # -- confirm intent.
                                            cur_calc += 1
                                            val = compare_bibrefrecs(bib1,
                                                                     bib2)
                                else:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                self._bib_matrix[bib1, bib2] = val
        except Exception as e:
            # 'except Exception, e' was Python-2-only syntax; 'as' is
            # valid on Python 2.6+ and Python 3.
            raise Exception("""Error happened in prob_matrix.recalculate
                            with val:%s
                            original_exception: %s
                            """ % (str(val), str(e)))

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt."
                            % (cur_calc, opti))
def recalculate(self, cluster_set):
    '''
    Construct the probability matrix for the given cluster set.

    The current matrix is stored, duplicated under '<name>copy' and used
    as a cache: entries for bibs that are still up to date are copied
    from it instead of being recomputed with compare_bibrefrecs. If the
    old matrix cannot be duplicated/loaded, everything is recomputed.

    @param cluster_set: A cluster set object, used to initialize the matrix.
    '''
    last_cleaned = 0
    self._bib_matrix.store()
    try:
        old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
        old_matrix.duplicate_existing(self._bib_matrix.name,
                                      self._bib_matrix.name + 'copy')
        old_matrix.load()
        cached_bibs = self.__get_up_to_date_bibs(old_matrix)
        have_cached_bibs = bool(cached_bibs)
    except IOError:
        # No usable previous matrix: fall back to full recomputation.
        # NOTE(review): if Bib_matrix() itself raised, old_matrix is
        # unbound here and destroy() would raise NameError -- confirm.
        old_matrix.destroy()
        cached_bibs = None
        have_cached_bibs = False

    self._bib_matrix.destroy()
    self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                  cluster_set=cluster_set)

    ncl = cluster_set.num_all_bibs
    # Number of unordered pairs; floor division keeps the result an
    # integer on both Python 2 and Python 3 (the original used '/').
    expected = (ncl * (ncl - 1)) // 2
    if expected == 0:
        expected = 1

    # Initialized up front so the error report in the except clause
    # below cannot fail with a NameError if an exception fires before
    # the first comparison is computed.
    val = None
    try:
        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:
            # Throttled progress output.
            if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                update_status(
                    (float(opti) + cur_calc) / expected,
                    "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc + opti

            # Periodic garbage collection to bound memory usage.
            if cur_calc - last_cleaned > 20000000:
                gc.collect()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                # id() ordering visits each unordered cluster pair once;
                # mutually exclusive ("hating") clusters are skipped.
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs:
                                try:
                                    val = old_matrix[bib1, bib2]
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(
                                            val,
                                            compare_bibrefrecs(bib1, bib2))
                                except KeyError:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                    if not val:
                                        # NOTE(review): recomputes when the
                                        # first comparison is falsy; looks
                                        # redundant -- confirm intent.
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            self._bib_matrix[bib1, bib2] = val
    except Exception as e:
        # 'except Exception, e' was Python-2-only syntax; 'as' is valid
        # on Python 2.6+ and Python 3.
        raise Exception("""Error happened in prob_matrix.recalculate
                        with val:%s
                        original_exception: %s
                        """ % (str(val), str(e)))
class ProbabilityMatrix(object):
    '''
    This class contains and maintains the comparison between all
    virtual authors. It is able to write and read from the database
    and update the results.
    '''
    def __init__(self, name):
        self._bib_matrix = Bib_matrix(name)

    def load(self, load_map=True, load_matrix=True):
        # NOTE(review): load_map/load_matrix are accepted but not used
        # here; kept for interface compatibility with callers.
        update_status(0., "Loading probability matrix...")
        self._bib_matrix.load()
        update_status_final("Probability matrix loaded.")

    def store(self):
        update_status(0., "Saving probability matrix...")
        self._bib_matrix.store()
        update_status_final("Probability matrix saved.")

    def __getitem__(self, bibs):
        return self._bib_matrix[bibs[0], bibs[1]]

    def getitem_numeric(self, bibs):
        return self._bib_matrix.getitem_numeric(bibs)

    def __get_up_to_date_bibs(self, bib_matrix):
        # Keys of the matrix whose papers were not modified after the
        # matrix was created, i.e. entries that can be reused.
        return frozenset(get_modified_papers_before(
            bib_matrix.get_keys(), bib_matrix.creation_time))

    def is_up_to_date(self, cluster_set):
        return self.__get_up_to_date_bibs(self._bib_matrix) >= frozenset(
            cluster_set.all_bibs())

    def recalculate(self, cluster_set):
        '''
        Construct the probability matrix for the given cluster set.

        The current matrix is stored, duplicated under '<name>copy' and
        used as a cache: entries for bibs that are still up to date are
        copied from it instead of being recomputed with
        compare_bibrefrecs. If the old matrix cannot be
        duplicated/loaded, everything is recomputed.

        @param cluster_set: A cluster set object, used to initialize the
            matrix.
        '''
        last_cleaned = 0
        self._bib_matrix.store()
        try:
            old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
            old_matrix.duplicate_existing(self._bib_matrix.name,
                                          self._bib_matrix.name + 'copy')
            old_matrix.load()
            cached_bibs = self.__get_up_to_date_bibs(old_matrix)
            have_cached_bibs = bool(cached_bibs)
        except IOError:
            # No usable previous matrix: fall back to full recomputation.
            # NOTE(review): if Bib_matrix() itself raised, old_matrix is
            # unbound here and destroy() would raise NameError -- confirm.
            old_matrix.destroy()
            cached_bibs = None
            have_cached_bibs = False

        self._bib_matrix.destroy()
        self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                      cluster_set=cluster_set)

        ncl = cluster_set.num_all_bibs
        # Number of unordered pairs; floor division keeps the result an
        # integer on both Python 2 and Python 3 (the original used '/').
        expected = (ncl * (ncl - 1)) // 2
        if expected == 0:
            expected = 1

        # Initialized up front so the error report in the except clause
        # below cannot fail with a NameError if an exception fires
        # before the first comparison is computed.
        val = None
        try:
            cur_calc, opti, prints_counter = 0, 0, 0
            for cl1 in cluster_set.clusters:
                # Throttled progress output.
                if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                    update_status(
                        (float(opti) + cur_calc) / expected,
                        "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                    prints_counter = cur_calc + opti

                # Periodic garbage collection to bound memory usage.
                if cur_calc - last_cleaned > 20000000:
                    gc.collect()
                    last_cleaned = cur_calc

                for cl2 in cluster_set.clusters:
                    # id() ordering visits each unordered cluster pair
                    # once; mutually exclusive clusters are skipped.
                    if id(cl1) < id(cl2) and not cl1.hates(cl2):
                        for bib1 in cl1.bibs:
                            for bib2 in cl2.bibs:
                                if have_cached_bibs:
                                    try:
                                        val = old_matrix[bib1, bib2]
                                        opti += 1
                                        if bconfig.DEBUG_CHECKS:
                                            assert _debug_is_eq_v(
                                                val,
                                                compare_bibrefrecs(bib1,
                                                                   bib2))
                                    except KeyError:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                        if not val:
                                            # NOTE(review): recomputes
                                            # when the first comparison
                                            # is falsy; looks redundant
                                            # -- confirm intent.
                                            cur_calc += 1
                                            val = compare_bibrefrecs(bib1,
                                                                     bib2)
                                else:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                self._bib_matrix[bib1, bib2] = val
        except Exception as e:
            # 'except Exception, e' was Python-2-only syntax; 'as' is
            # valid on Python 2.6+ and Python 3.
            raise Exception("""Error happened in prob_matrix.recalculate
                            with val:%s
                            original_exception: %s
                            """ % (str(val), str(e)))

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt."
                            % (cur_calc, opti))