def recalculate(self, cluster_set):
    '''
    Rebuilds the probability matrix for the given cluster set.

    The matrix currently held in self._bib_matrix is kept aside and a
    fresh Bib_matrix is created. For every pair of bibs taken from two
    distinct clusters which do not hate each other, the old value is
    reused when both bibs are among the ones returned by
    __get_up_to_date_bibs() and the stored value is truthy. In every
    other case the value is computed with compare_bibrefrecs.

    @param cluster_set: A cluster set object, used to initialize the
        matrix.
    '''
    cleaned_at = 0
    previous_matrix = self._bib_matrix
    fresh_bibs = self.__get_up_to_date_bibs()
    reuse_possible = bool(fresh_bibs)
    self._bib_matrix = Bib_matrix(cluster_set)

    total_bibs = cluster_set.num_all_bibs
    expected = ((total_bibs * (total_bibs - 1)) / 2)
    if expected == 0:
        # Avoid a division by zero in the progress ratio below.
        expected = 1

    computed = 0
    reused = 0

    for left in cluster_set.clusters:
        update_status((float(reused) + computed) / expected,
                      "Prob matrix: calc %d, opti %d." % (computed, reused))

        # The comparison caches are only inspected once per outer cluster.
        if computed - cleaned_at > 2000000:
            clear_comparison_caches()
            cleaned_at = computed

        for right in cluster_set.clusters:
            # The id() ordering makes sure each unordered pair of clusters
            # is visited exactly once and never paired with itself.
            if id(left) >= id(right) or left.hates(right):
                continue

            for bib1 in left.bibs:
                for bib2 in right.bibs:
                    val = None
                    if reuse_possible and bib1 in fresh_bibs and bib2 in fresh_bibs:
                        val = previous_matrix[bib1, bib2]

                    if val:
                        reused += 1
                        if bconfig.DEBUG_CHECKS:
                            assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                    else:
                        # Either nothing could be reused or the stored
                        # value was falsy: compute it from scratch.
                        computed += 1
                        val = compare_bibrefrecs(bib1, bib2)

                    self._bib_matrix[bib1, bib2] = val

    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt." % (computed, reused))
def recalculate(self, cluster_set):
    '''
    Recomputes the probability matrix of this object for cluster_set.

    A new Bib_matrix replaces self._bib_matrix. Values of the replaced
    matrix are reused for pairs whose two bibs are both contained in the
    result of __get_up_to_date_bibs(), provided that the stored value is
    truthy. All the remaining pairs are computed with compare_bibrefrecs.
    Only pairs of bibs from two different clusters which do not hate each
    other are filled in.

    @param cluster_set: A cluster set object, used to initialize the
        matrix.
    '''
    last_flush = 0
    stale_matrix = self._bib_matrix
    up_to_date = self.__get_up_to_date_bibs()
    any_up_to_date = bool(up_to_date)
    self._bib_matrix = Bib_matrix(cluster_set)

    bib_count = cluster_set.num_all_bibs
    expected = ((bib_count * (bib_count - 1)) / 2)
    if expected == 0:
        # Keeps the progress ratio below well defined.
        expected = 1

    n_calc = 0
    n_opti = 0

    for first in cluster_set.clusters:
        progress = (float(n_opti) + n_calc) / expected
        update_status(progress, "Prob matrix: calc %d, opti %d." % (n_calc, n_opti))

        # Flush the comparison caches after enough new computations;
        # this is checked once per cluster of the outer loop.
        if n_calc - last_flush > 2000000:
            clear_comparison_caches()
            last_flush = n_calc

        for second in cluster_set.clusters:
            # Visit every unordered pair of clusters once, skip hated ones.
            if id(first) >= id(second):
                continue
            if first.hates(second):
                continue

            for bib1 in first.bibs:
                for bib2 in second.bibs:
                    val = None
                    if any_up_to_date and bib1 in up_to_date and bib2 in up_to_date:
                        val = stale_matrix[bib1, bib2]

                    if val:
                        n_opti += 1
                        if bconfig.DEBUG_CHECKS:
                            assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                    else:
                        # No reusable value (or a falsy one): compute it.
                        n_calc += 1
                        val = compare_bibrefrecs(bib1, bib2)

                    self._bib_matrix[bib1, bib2] = val

    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt." % (n_calc, n_opti))
def __init__(self, cluster_set, last_name="", cached=(), use_cache=False, save_cache=False):
    '''
    Constructs probability matrix. If use_cache is true, it will
    try to load old computations from the database. If save cache
    is true it will save the current results into the database.

    Every ordered pair of bibs taken from two distinct clusters which
    do not hate each other gets a value. The value is taken from the
    loaded matrix when both bibs are in cached and the stored value is
    not None, otherwise it is computed with compare_bibrefrecs.

    @param cluster_set: A cluster set object, used to initialize
        the matrix.
    @param last_name: A string which defines the current cluster
        of names. It is used only if use_cache or save_cache is true.
    @param cached: A collection with the bibs, which are not touched
        since last save.
    @param use_cache: If true and the probability table exists, the
        old matrix is loaded for last_name.
    @param save_cache: If true, the resulting matrix is stored under
        last_name, creating the probability table first if needed.

    @raise AssertionError: if cached is not empty but no old matrix was
        loaded (use_cache is false or the probability table is missing).
    '''
    self._bib_matrix = self.bib_matrix(cluster_set)
    old_matrix = self.bib_matrix()

    if use_cache and probability_table_exists():
        old_matrix.load(last_name)
    elif cached:
        # Adjacent string literals are concatenated verbatim, hence the
        # explicit trailing space after "cached".
        raise AssertionError("You cannot have cached "
                             "results and empty table!")

    for cl1 in cluster_set.clusters:
        for cl2 in cluster_set.clusters:
            # A cluster is never compared with itself, nor with a cluster
            # it hates.
            if cl1 is cl2 or cl1.hates(cl2):
                continue

            for bib1 in cl1.bibs:
                # The membership of bib1 does not depend on bib2, so it is
                # tested once per bib1 instead of once per pair.
                bib1_cached = bib1 in cached

                for bib2 in cl2.bibs:
                    val = None
                    if bib1_cached and bib2 in cached:
                        val = old_matrix[bib1, bib2]

                    # None means "no stored value"; other falsy values
                    # coming from the old matrix are kept as they are.
                    if val is None:
                        val = compare_bibrefrecs(bib1, bib2)

                    self._bib_matrix[bib1, bib2] = val

    if save_cache:
        if not probability_table_exists():
            create_probability_table()
        self._bib_matrix.store(last_name)
def __init__(self, cluster_set, last_name="", cached=(), use_cache=False, save_cache=False):
    '''
    Constructs probability matrix. If use_cache is true, it will
    try to load old computations from the database. If save cache
    is true it will save the current results into the database.

    A value is stored for every ordered pair of bibs which belong to two
    distinct clusters that do not hate each other. When both bibs are in
    cached and the loaded matrix holds a value which is not None, that
    value is reused; otherwise compare_bibrefrecs computes it.

    @param cluster_set: A cluster set object, used to initialize
        the matrix.
    @param last_name: A string which defines the current cluster
        of names. It is used only if use_cache or save_cache is true.
    @param cached: A collection with the bibs, which are not touched
        since last save.
    @param use_cache: If true and the probability table exists, the
        old matrix is loaded for last_name.
    @param save_cache: If true, the resulting matrix is stored under
        last_name, creating the probability table first if needed.

    @raise AssertionError: if cached is not empty but no old matrix was
        loaded (use_cache is false or the probability table is missing).
    '''
    self._bib_matrix = self.bib_matrix(cluster_set)
    old_matrix = self.bib_matrix()

    if use_cache and probability_table_exists():
        old_matrix.load(last_name)
    elif cached:
        # The space after "cached" is required: adjacent string literals
        # are joined without any separator.
        raise AssertionError("You cannot have cached "
                             "results and empty table!")

    for cl1 in cluster_set.clusters:
        for cl2 in cluster_set.clusters:
            # Skip the cluster itself and the clusters it hates.
            if cl1 is cl2 or cl1.hates(cl2):
                continue

            for bib1 in cl1.bibs:
                # Loop invariant for the inner loop: test it only once.
                bib1_cached = bib1 in cached

                for bib2 in cl2.bibs:
                    val = None
                    if bib1_cached and bib2 in cached:
                        val = old_matrix[bib1, bib2]

                    # Only a missing value (None) triggers a computation.
                    if val is None:
                        val = compare_bibrefrecs(bib1, bib2)

                    self._bib_matrix[bib1, bib2] = val

    if save_cache:
        if not probability_table_exists():
            create_probability_table()
        self._bib_matrix.store(last_name)
def __init__(self, cluster_set, use_cache=False, save_cache=False):
    '''
    Builds the probability matrix for the given cluster set.

    With use_cache set, an old matrix is loaded for
    cluster_set.last_name; if the load succeeds, its keys are passed
    through filter_modified_record_ids together with its creation time
    and the resulting bibs may reuse their old values. With save_cache
    set, the new matrix is stored under cluster_set.last_name together
    with the time taken from get_sql_time() before the computation.

    Only pairs of bibs from two different clusters which do not hate
    each other are filled in. An old value is reused when both bibs are
    reusable and the value is truthy, otherwise compare_bibrefrecs is
    called.

    @param cluster_set: A cluster set object, used to initialize
        the matrix.
    @param use_cache: Try to reuse the values of a stored matrix.
    @param save_cache: Store the resulting matrix.
    '''
    self._bib_matrix = bib_matrix(cluster_set)
    previous = bib_matrix()

    total_bibs = 0
    for cluster in cluster_set.clusters:
        total_bibs += len(cluster.bibs)

    expected = ((total_bibs * (total_bibs - 1)) / 2)
    if expected == 0:
        # Keeps the progress ratio below well defined.
        expected = 1

    reusable = set()
    if use_cache and previous.load(cluster_set.last_name):
        reusable = set(filter_modified_record_ids(previous.get_keys(),
                                                  previous.creation_time))

    if save_cache:
        creation_time = get_sql_time()

    computed = 0
    reused = 0

    for left in cluster_set.clusters:
        update_status((float(reused) + computed) / expected,
                      "Prob matrix: calc %d, opti %d." % (computed, reused))

        for right in cluster_set.clusters:
            # The id() ordering visits each unordered pair of clusters
            # once and never pairs a cluster with itself.
            if id(left) >= id(right) or left.hates(right):
                continue

            for bib1 in left.bibs:
                for bib2 in right.bibs:
                    val = None
                    if bib1 in reusable and bib2 in reusable:
                        val = previous[bib1, bib2]

                    if val:
                        reused += 1
                        if bconfig.DEBUG_CHECKS:
                            assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                    else:
                        computed += 1
                        # The comparison caches are flushed every
                        # 10000000 computed values.
                        if computed % 10000000 == 0:
                            clear_comparison_caches()
                        val = compare_bibrefrecs(bib1, bib2)

                    self._bib_matrix[bib1, bib2] = val

    clear_comparison_caches()

    if save_cache:
        update_status(1., "saving...")
        self._bib_matrix.store(cluster_set.last_name, creation_time)

    update_status_final("Matrix done. %d calc, %d opt." % (computed, reused))