Esempio n. 1
0
def test():
    """Compare clustering-based partitioning against Zhang's classical
    algorithm on the module-level ``queries`` workload and print both costs.

    NOTE(review): other examples in this file pass the query list to
    ``zhang`` as a fourth argument -- confirm this four-argument call
    matches the signature in use here.
    """
    banner = (
        '-----------------------------------------------------------------------'
    )
    print(banner)
    print('CLUSTERING vs CLASSICAL ALGORITHMS TEST')
    print(banner)

    connector = PostgresConnector()
    # Cost of the hierarchical-clustering partitioning computed from the DB.
    clustering_cost = combined_horizontal_from_db(
        connector, 'euclidean_distance', 'single_linkage')

    # Every query is weighted equally.
    frequencies = [1] * 9
    # Distinct predicates across the whole workload.
    predicates = list(set().union(*queries))
    # predicate_usage[q] lists indices (into ``predicates``) used by query q,
    # in increasing index order.
    predicate_usage = [
        [p_idx for p_idx, predicate in enumerate(predicates)
         if predicate in query]
        for query in queries
    ]
    zhang_cost = zhang(frequencies, predicates, predicate_usage, connector)

    print(
        '***********************************************************************'
    )
    print('RESULT ')
    print('clustering cost: {} zhang_cost {}'.format(clustering_cost,
                                                     zhang_cost))
Esempio n. 2
0
def test_different_duplicates_percentage():
    """Run Zhang's algorithm over workloads with increasing duplicate
    percentages, accumulating total cost and cost-model calls.

    For each percentage the accumulated results are written to
    ``../input_data/queries/zhang/result_<percentage>.txt`` and the full
    result dict is printed at the end.
    """
    print('ZHANG ALGORITHM <DIFFERENT DUPLICATES PERCENTAGE TESTS>')
    percentages = [10, 20, 30, 40, 50, 60, 70]
    results = {}
    for percentage in percentages:
        print("DUPLICATES PERCENTAGE = " + str(percentage))
        directory = '../input_data/queries/zhang/' + str(percentage)
        input_files = os.listdir(directory)
        # Load every pickled workload; `with` closes each handle (the
        # original `pickle.load(open(...))` leaked file descriptors).
        queries_files = []
        for file in input_files:
            with open(directory + '/' + file, 'rb') as fp:
                queries_files.append(pickle.load(fp))
        cost_model_calls = 0
        cost = 0
        for queries_file in queries_files:
            # predicate_usage[q] lists indices (into ``predicates``) of the
            # predicates used by query q.
            predicate_usage = [[] for _ in queries_file]
            predicates = list(set().union(*queries_file))
            for predicate_idx, predicate in enumerate(predicates):
                for query_idx, query in enumerate(queries_file):
                    if predicate in query:
                        predicate_usage[query_idx].append(predicate_idx)
            result = zhang([1] * len(queries_file), predicates,
                           predicate_usage, queries_file, PostgresConnector())
            cost_model_calls += result['cost_model_calls']
            cost += result['cost']
        results[percentage] = {
            'cost model calls': cost_model_calls,
            'cost': cost
        }
        # BUG FIX: the original built the path with a literal backslash
        # ('\..'), which is a path separator only on Windows; build a
        # portable path into the parent directory instead.
        result_path = os.path.join(directory, '..',
                                   'result_' + str(percentage) + '.txt')
        with open(result_path, 'w') as f:
            print(results, file=f)
    print('ZHANG ALGORITHM <DIFFERENT DUPLICATES PERCENTAGE TESTS> RESULTS: ',
          results)
Esempio n. 3
0
def test_my_values():
    """Run Zhang's algorithm on the module-level ``queries`` workload."""
    print(
        '-----------------------------------------------------------------------'
    )
    print('ZHANG ALGORITHM TESTS')
    print(
        '-----------------------------------------------------------------------'
    )
    # Every query is weighted equally.  (The original also defined an
    # unused local ``predicates_amount = 9``; it has been removed.)
    frequencies = [1] * 9
    # predicate_usage[q] lists indices (into ``predicates``) of the
    # predicates used by query q.
    predicate_usage = [[] for _ in queries]
    predicates = list(set().union(*queries))
    for predicate_idx, predicate in enumerate(predicates):
        for query_idx, query in enumerate(queries):
            if predicate in query:
                predicate_usage[query_idx].append(predicate_idx)
    # NOTE(review): other examples in this file pass the query list to
    # ``zhang`` as a fourth argument -- confirm the signature expected here.
    zhang(frequencies, predicates, predicate_usage, PostgresConnector())
def zhang_vs_clustering():
    """Compare Zhang's algorithm with clustering-based partitioning across
    workloads of increasing duplicate percentages.

    Accumulates cost and cost-function calls per (percentage, setting),
    writes per-percentage results next to each input directory, and prints
    the full result dict at the end.
    """
    connector = PostgresConnector()
    percentages = [10, 20, 30, 40, 50, 60, 70]
    # (linkage, distance) pairs for the clustering runs; Zhang results are
    # always collected in addition to these settings.
    settings = [
        ('penalty_based', 'penalty_based'),
        # ('single_linkage', 'euclidean_distance'),
        # ('complete_linkage', 'maximum_distance')
    ]
    tests_count = 0
    # BUG FIX: the original initialized ``results`` only with keys from
    # ``settings`` (where 'zhang' was commented out), so the unconditional
    # ``results[percentage]['zhang']`` accesses below raised KeyError.
    results = {
        percentage: {
            key: {'cost': 0, 'cost_function_calls': 0}
            for key in ['zhang'] + settings
        }
        for percentage in percentages
    }
    for percentage in percentages:
        print("DUPLICATES PERCENTAGE = " + str(percentage))
        directory = '../input_data/queries/zhang/' + str(percentage)
        input_files = os.listdir(directory)
        # Load every pickled workload; `with` closes each handle (the
        # original `pickle.load(open(...))` leaked file descriptors).
        queries_files = []
        for file in input_files:
            with open(directory + '/' + file, 'rb') as fp:
                queries_files.append(pickle.load(fp))
        for idx, queries_file in enumerate(queries_files):
            print("INPUT FILE = " + directory + '/' + str(idx))
            # zhang
            predicate_usage = [[] for _ in queries_file]
            predicates = list(set().union(*queries_file))
            for predicate_idx, predicate in enumerate(predicates):
                for query_idx, query in enumerate(queries_file):
                    if predicate in query:
                        predicate_usage[query_idx].append(predicate_idx)
            zhang_result = zhang([1] * len(queries_file), predicates,
                                 predicate_usage, queries_file,
                                 PostgresConnector())
            results[percentage]['zhang']['cost'] += zhang_result['cost']
            results[percentage]['zhang']['cost_function_calls'] += \
                zhang_result['cost_model_calls']
            tests_count += 1

            # clustering
            for setting in settings:
                # Kept from the original: allows mixing the literal string
                # 'zhang' into ``settings`` without running it twice.
                if setting == 'zhang':
                    continue
                clustering_results = combined_horizontal_from_db(
                    connector, setting[1], setting[0], True, queries_file,
                    False)
                tests_count += 1
                print("TEST COUNT = " + str(tests_count))
                results[percentage][setting]['cost'] += \
                    clustering_results['cost']
                results[percentage][setting]['cost_function_calls'] += \
                    clustering_results['cost_function_calls']
        # BUG FIX: portable path instead of the Windows-only '\..' literal.
        result_path = os.path.join(directory, '..',
                                   'result_' + str(percentage) + '.txt')
        with open(result_path, 'w') as f:
            print(results[percentage], file=f)
    print('ZHANG ALGORITHM <DIFFERENT DUPLICATES PERCENTAGE TESTS> RESULTS: ',
          results)
Esempio n. 5
0
def compare_creation_time():
    """Enumerate every keep/negate combination of the workload's queries,
    keep each combination that selects at least one row as a singleton
    cluster, then create the partitions and print their total cost.
    """
    connector = PostgresConnector()

    print('test: compare creation times')

    distinct_clusters = {}
    cluster_count = 0
    # Each `signs` tuple assigns 1 (keep as-is) or 0 (negate) to every query.
    for signs in itertools.product((0, 1), repeat=len(queries)):
        conjuncts = []
        for keep, query in zip(signs, queries):
            conjuncts.append(query if keep else 'NOT ' + query)
        # Only combinations that actually match rows become clusters.
        if select_count(connector, conjuncts):
            distinct_clusters[str(cluster_count)] = [signs]
            cluster_count += 1
    cost = create_partitions(connector, distinct_clusters,
                             table_name + '_copy')
    print(distinct_clusters)
    print('all clusters costs: ', cost)
def main():

    # Compute the clustering-based partitioning using euclidean distance
    # with single linkage (result is discarded here).
    connector = PostgresConnector()
    combined_horizontal_from_db(connector, 'euclidean_distance',
                                'single_linkage')
            # NOTE(review): everything from here down is over-indented
            # relative to main() and references names that are never
            # defined above (``min_length``, ``predicate``, ``queries``,
            # ``directory``).  It appears to be the tail of a different
            # query-generation function whose beginning was lost when this
            # file was assembled -- the file will not parse as-is.
            candidates_queries = [
                query_idx for query_idx, query in enumerate(queries)
                if len(query) == min_length
                and predicate[0] not in [p[0] for p in query]
            ]
            queries[random.choice(candidates_queries)].append(predicate)

    # checks (just to be sure)
    # NOTE(review): the assert is only reached when the condition it
    # asserts is already known false, so it always fails here; the
    # ``return`` after it is unreachable.  Also, ``assert`` is stripped
    # under ``python -O`` -- confirm whether this should raise instead.
    for query in queries:
        columns = [predicate[0] for predicate in query]
        if len(columns) != len(set(columns)):
            assert len(columns) == len(
                set(columns)
            ), 'there are several predicates on the same column in one query'
            return

    # write to a file
    # Pick the next free numeric file name in ``directory`` and pickle the
    # generated queries into it.
    os.makedirs(directory, exist_ok=True)
    files_in_directory = os.listdir(directory)
    file_to_create = 0
    if files_in_directory:
        file_to_create = max(
            [int(file_name) for file_name in files_in_directory]) + 1
    with open(directory + '/' + str(file_to_create), 'wb') as fp:
        pickle.dump(queries, fp)


if __name__ == "__main__":
    # Script entry point: build a DB connector and generate the query
    # workload files (``generate_queries`` is defined elsewhere in the
    # original project).
    connector = PostgresConnector()
    generate_queries(connector)