def test():
    """Compare the clustering-based partitioning cost against Zhang's algorithm.

    Runs both algorithms over the module-level ``queries`` using a live
    ``PostgresConnector`` and prints the two costs side by side.
    """
    print('-----------------------------------------------------------------------')
    print('CLUSTERING vs CLASSICAL ALGORITHMS TEST')
    print('-----------------------------------------------------------------------')
    connector = PostgresConnector()
    clustering_cost = combined_horizontal_from_db(connector, 'euclidean_distance', 'single_linkage')
    # Every query is weighted equally (generalized from the hard-coded `[1] * 9`).
    frequencies = [1] * len(queries)
    # predicate_usage[q] lists indices (into `predicates`) of predicates used by query q.
    predicate_usage = [[] for _ in queries]
    predicates = list(set().union(*queries))
    for predicate_idx, predicate in enumerate(predicates):
        for query_idx, query in enumerate(queries):
            if predicate in query:
                predicate_usage[query_idx].append(predicate_idx)
    # BUG FIX: the other call sites in this module pass the query list before
    # the connector; the original 4-arg call put the connector in the queries
    # slot.
    zhang_cost = zhang(frequencies, predicates, predicate_usage, queries, connector)
    print('***********************************************************************')
    print('RESULT ')
    print('clustering cost: {} zhang_cost {}'.format(clustering_cost, zhang_cost))
def test_different_duplicates_percentage():
    """Run Zhang's algorithm over query workloads with varying duplicate rates.

    For each percentage, loads every pickled query workload from
    ``../input_data/queries/zhang/<percentage>``, accumulates Zhang's cost and
    cost-model-call counts, and writes the running results next to the input
    directory.
    """
    print('ZHANG ALGORITHM <DIFFERENT DUPLICATES PERCENTAGE TESTS>')
    percentages = [10, 20, 30, 40, 50, 60, 70]
    results = {}
    for percentage in percentages:
        print("DUPLICATES PERCENTAGE = " + str(percentage))
        directory = '../input_data/queries/zhang/' + str(percentage)
        input_files = os.listdir(directory)
        # FIX: close each pickle file instead of leaking the handles the
        # original left open.
        queries_files = []
        for file in input_files:
            with open(os.path.join(directory, file), 'rb') as fp:
                queries_files.append(pickle.load(fp))
        cost_model_calls = 0
        cost = 0
        for queries_file in queries_files:
            # predicate_usage[q] lists indices (into `predicates`) used by query q.
            predicate_usage = [[] for _ in queries_file]
            predicates = list(set().union(*queries_file))
            for predicate_idx, predicate in enumerate(predicates):
                for query_idx, query in enumerate(queries_file):
                    if predicate in query:
                        predicate_usage[query_idx].append(predicate_idx)
            result = zhang([1] * len(queries_file), predicates, predicate_usage, queries_file, PostgresConnector())
            cost_model_calls += result['cost_model_calls']
            cost += result['cost']
        results[percentage] = {'cost model calls': cost_model_calls, 'cost': cost}
        # BUG FIX: the original built the path with a literal backslash
        # (directory + '\../result_...'), which is not portable; join the
        # parent directory portably instead.
        result_path = os.path.join(directory, '..', 'result_' + str(percentage) + '.txt')
        with open(result_path, 'w') as f:
            print(results, file=f)
    print('ZHANG ALGORITHM <DIFFERENT DUPLICATES PERCENTAGE TESTS> RESULTS: ', results)
def test_my_values():
    """Run Zhang's algorithm on the module-level ``queries`` with unit frequencies."""
    print('-----------------------------------------------------------------------')
    print('ZHANG ALGORITHM TESTS')
    print('-----------------------------------------------------------------------')
    # Unit frequency per query (generalized from the hard-coded `[1] * 9`;
    # the original also declared an unused `predicates_amount`, dropped here).
    frequencies = [1] * len(queries)
    # predicate_usage[q] lists indices (into `predicates`) used by query q.
    predicate_usage = [[] for _ in queries]
    predicates = list(set().union(*queries))
    for predicate_idx, predicate in enumerate(predicates):
        for query_idx, query in enumerate(queries):
            if predicate in query:
                predicate_usage[query_idx].append(predicate_idx)
    # BUG FIX: zhang() takes the query list before the connector (see the
    # other call sites in this module); the original 4-arg call omitted it.
    zhang(frequencies, predicates, predicate_usage, queries, PostgresConnector())
def zhang_vs_clustering():
    """Benchmark Zhang's algorithm against clustering-based partitioning.

    For each duplicates percentage, runs both approaches over every pickled
    workload in ``../input_data/queries/zhang/<percentage>`` and accumulates
    cost and cost-function-call totals per approach, writing per-percentage
    results next to the input directory.
    """
    connector = PostgresConnector()
    percentages = [10, 20, 30, 40, 50, 60, 70]
    settings = [
        # 'zhang',
        ('penalty_based', 'penalty_based')
        # ('single_linkage', 'euclidean_distance'),
        # ('complete_linkage', 'maximum_distance')
    ]
    tests_count = 0
    # BUG FIX: the accumulation below always writes results[percentage]['zhang'],
    # but 'zhang' is commented out of `settings`, so the original raised
    # KeyError. Track it explicitly alongside the clustering settings.
    tracked = set(settings) | {'zhang'}
    results = {percentage: {setting: {'cost': 0, 'cost_function_calls': 0} for setting in tracked}
               for percentage in percentages}
    for percentage in percentages:
        print("DUPLICATES PERCENTAGE = " + str(percentage))
        directory = '../input_data/queries/zhang/' + str(percentage)
        input_files = os.listdir(directory)
        # FIX: close each pickle file instead of leaking the handles.
        queries_files = []
        for file in input_files:
            with open(os.path.join(directory, file), 'rb') as fp:
                queries_files.append(pickle.load(fp))
        for idx, queries_file in enumerate(queries_files):
            print("INPUT FILE = " + directory + '/' + str(idx))
            # --- Zhang's algorithm ---
            # predicate_usage[q] lists indices (into `predicates`) used by query q.
            predicate_usage = [[] for _ in queries_file]
            predicates = list(set().union(*queries_file))
            for predicate_idx, predicate in enumerate(predicates):
                for query_idx, query in enumerate(queries_file):
                    if predicate in query:
                        predicate_usage[query_idx].append(predicate_idx)
            zhang_result = zhang([1] * len(queries_file), predicates, predicate_usage, queries_file, PostgresConnector())
            results[percentage]['zhang']['cost'] += zhang_result['cost']
            results[percentage]['zhang']['cost_function_calls'] += zhang_result['cost_model_calls']
            tests_count += 1
            # --- clustering approaches ---
            for setting in settings:
                if setting == 'zhang':
                    continue
                # setting is (linkage, distance); the helper expects (distance, linkage).
                clustering_results = combined_horizontal_from_db(connector, setting[1], setting[0], True, queries_file, False)
                tests_count += 1
                print("TEST COUNT = " + str(tests_count))
                results[percentage][setting]['cost'] += clustering_results['cost']
                results[percentage][setting]['cost_function_calls'] += clustering_results['cost_function_calls']
        # BUG FIX: the original built the path with a literal backslash
        # (directory + '\../result_...'); join the parent directory portably.
        result_path = os.path.join(directory, '..', 'result_' + str(percentage) + '.txt')
        with open(result_path, 'w') as f:
            print(results[percentage], file=f)
    print('ZHANG ALGORITHM <DIFFERENT DUPLICATES PERCENTAGE TESTS> RESULTS: ', results)
def compare_creation_time():
    """Enumerate every query-sign combination, keep the non-empty ones as
    clusters, and measure the cost of materializing them as partitions."""
    connector = PostgresConnector()
    print('test: compare creation times')
    distinct_clusters = {}
    cluster_count = 0
    # Each `signs` tuple picks, per query, whether it applies positively (1)
    # or negated (0).
    for signs in itertools.product([0, 1], repeat=len(queries)):
        chosen = []
        for position, query in enumerate(queries):
            chosen.append(query if signs[position] else 'NOT ' + query)
        # Only keep combinations that actually match at least one row.
        if select_count(connector, chosen):
            distinct_clusters[str(cluster_count)] = [signs]
            cluster_count += 1
    cost = create_partitions(connector, distinct_clusters, table_name + '_copy')
    print(distinct_clusters)
    print('all clusters costs: ', cost)
def main():
    """Entry point: run clustering-based partitioning with the default setup."""
    db_connector = PostgresConnector()
    combined_horizontal_from_db(db_connector, 'euclidean_distance', 'single_linkage')
candidates_queries = [ query_idx for query_idx, query in enumerate(queries) if len(query) == min_length and predicate[0] not in [p[0] for p in query] ] queries[random.choice(candidates_queries)].append(predicate) # checks (just to be sure) for query in queries: columns = [predicate[0] for predicate in query] if len(columns) != len(set(columns)): assert len(columns) == len( set(columns) ), 'there are several predicates on the same column in one query' return # write to a file os.makedirs(directory, exist_ok=True) files_in_directory = os.listdir(directory) file_to_create = 0 if files_in_directory: file_to_create = max( [int(file_name) for file_name in files_in_directory]) + 1 with open(directory + '/' + str(file_to_create), 'wb') as fp: pickle.dump(queries, fp) if __name__ == "__main__": connector = PostgresConnector() generate_queries(connector)