from union_find import *
from math import log
import sys
import numpy as np
from numba import njit, jit
from numpy import random as rand
from model_sqrt import *
from merging_methods_v4_slim import *
import csv
"""
This module implements a cluster editing algorithm. It uses a semi-streaming approach and is therefore able to process files that would be too big for main memory. This is the classical version of uf_ce, there is a second approach available using rem's algorithm instead of union-find.
"""
# Input sollte aus je 3 mit Leerzeichen getrennten Einträgen pro Zeile bestehen:
# <Nummer Knoten 1> <Nummer Knoten 2> <Gewicht der Kante>
# Die Knotenbezeichnungen sind (von 0 bis n-1) numpy.int64, die Gewichte numpy.float64
# missing_weight: Gewicht für fehlende Kanten (die nicht in der Datei enthalten sind)
# n: Anzahl Objekte/Knoten
# x: Anzahl generierter Lösungen (mehr = besser, aber teurer in Speicher/Laufzeit)
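# A hypothetical input snippet for illustration only (the values are not from any real
# dataset): three unweighted edges on the nodes 0..3, with missing_weight = -1 supplying
# the weight of every node pair that does not appear in the file.
#
#   0 1 1.0
#   0 2 1.0
#   2 3 1.0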
def unionfind_cluster_editing(filename, output_path, missing_weight, n, x):
    """
    Cluster editing algorithm based on a semi-streaming approach that uses union-find to analyze the graph structure.
    The input file should contain one edge per line in the format
    <np.int64: node 1> <np.int64: node 2> <np.float64: edge weight>\n
    Parameter missing_weight sets the weight for edges that are not contained in the file (for unweighted data: -1).
    Parameter n gives the number of objects (nodes).
    Parameter x is the number of generated solutions (which form the basis for the merged solution). Larger x improves the result but increases memory use and running time, so with limited memory it should not be chosen too high. 300-1000 is recommended; the more the better.
    A minimal usage sketch can be found at the bottom of this module.
    """
    # Heuristic parameters: merge_filter and repair_filter select the fraction of best
    # solutions used for merging and repairing; union_threshold and big_border are passed
    # on to the merge and repair routines.
    merge_filter = 0.1
    repair_filter = 0.9
    union_threshold = 0.05
    big_border = 0.3
    graph_file = open(filename, mode="r")

    ### Preprocessing ###
    # Compute the degree of every node (scan over all edges)
    node_dgr = np.zeros(n, dtype=np.int64)
    for line in graph_file:
        # skip comment lines
        if line[0] == "#":
            continue
        splitted = line.split()
        nodes = np.array(splitted[:-1], dtype=np.int64)
        weight = np.float64(splitted[2])
        # if the edge 'exists' according to the threshold:
        if weight > 0:
            node_dgr[nodes[0]] += 1
            node_dgr[nodes[1]] += 1

    # Sequentially for all solutions (all UF structures at once, or at least as many as fit):
    # size of one solution: array with n entries of 64 bit each
    ### Generate Solutions ###
    parents = np.full((x,n), np.arange(n, dtype=np.int64))
    sizes = np.ones((x,n), dtype=np.int64)
    # Load the model parameters:
    parameters_b = load_model_flexible_v2('params_below_100.csv')
    parameters_a = load_model_flexible_v2('params_above_100.csv')
    #cluster_count = np.full(x, n, dtype=np.int64)
    # Fix all parameters for the models:
    cluster_model = np.full(x,17)
    def generate_solutions(first, c_opt):
        if first:
            # distribute the 37 cluster-model parameters evenly over the x solutions
            k = int(x/37)
            j = 0
            c = 0
            for i in range(0,x):
                cluster_model[i] = c
                j += 1
                if j == k and c < 36:
                    c += 1
                    j = 0
        if not first:
            # overwrite solutions with a non-optimal parameter, so that new ones can be generated afterwards
            for i in range(0,x):
                if cluster_model[i] != c_opt:
                    parents[i] = np.arange(n, dtype=np.int64)
                    sizes[i] = np.ones(n, dtype = np.int64)
        # 2nd scan over all edges: sample every edge into the UF structures
        graph_file = open(filename, mode="r")
        for line in graph_file:
            # skip comment lines
            if line[0] == "#":
                continue
            splitted = line.split()
            nodes = np.array(splitted[:-1], dtype=np.int64)
            weight = np.float64(splitted[2])
            guess_n = (node_dgr[nodes[0]] + node_dgr[nodes[1]]) / 2
            decision_values = rand.rand(x)
            for i in range(0, x):
                if not first:
                    if cluster_model[i] == c_opt:
                        # in the 2nd run, do not change the solutions that are already good!
                        continue
                # determine the sampling rate
                sampling_rate = model_flexible_v2(parameters_b, parameters_a, guess_n, cluster_model[i])
                # if the edge is sampled...
                if decision_values[i] < sampling_rate:
                    # ...insert it into the UF structure
                    union(nodes[0], nodes[1], parents[i], sizes[i])
    generate_solutions(True, 0)
    ### Solution Assessment ###
    # Post-processing of all solutions: flatten the structure (= nodes in the same cluster share the same array entry)
    # and compute the number of required edges per cluster (n_c * (n_c-1) / 2) per UF structure
    def calculate_costs(solutions_parents, x, merged):
        if merged:
            inner_sizes = merged_sizes
        else:
            inner_sizes = sizes
        solution_costs = np.zeros(x, dtype=np.float64)
        vertex_costs = np.zeros((x,n), dtype=np.float64)
        c_edge_counter = np.zeros((x,n), dtype=np.int64)
        for i in range(x):
            for j in range(n):
                root = flattening_find(j,solutions_parents[i])
                n_c = inner_sizes[i, root]
                # each node in a cluster of size n_c needs n_c - 1 incident edges within the cluster
                c_edge_counter[i, j] = n_c - 1
        # 3rd scan over all edges: cost computation for all solutions (total costs and per-cluster costs)
        graph_file = open(filename, mode="r")
        for line in graph_file:
            # skip comment lines
            if line[0] == "#":
                continue
            splitted = line.split()
            nodes = np.array(splitted[:-1], dtype=np.int64)
            weight = np.float64(splitted[2])
            for i in range(0,x):
                if not merged:
                    root1 = find(nodes[0],solutions_parents[i])
                    root2 = find(nodes[1],solutions_parents[i])
                else:
                    root1 = solutions_parents[i, nodes[0]]
                    root2 = solutions_parents[i, nodes[1]]
                # edge between two clusters
                if root1 != root2:
                    # with positive weight (one edge too many)
                    if weight > 0:
                        vertex_costs[i, nodes[0]] += weight / 2
                        vertex_costs[i, nodes[1]] += weight / 2
                        solution_costs[i] += weight
                # edge within a cluster
                else:
                    # with negative weight (the edge is missing)
                    if weight < 0:
                        vertex_costs[i, nodes[0]] -= weight / 2
                        vertex_costs[i, nodes[1]] -= weight / 2
                        solution_costs[i] -= weight
                    c_edge_counter[i, nodes[0]] -= 1
                    c_edge_counter[i, nodes[1]] -= 1
                    #print("missing edges for now: ", c_edge_counter[i][root1])
        for i in range(0,x):
            # iterate over all nodes:
            for j in range(n):
                missing_edges = c_edge_counter[i, j]
                if missing_edges > 0:
                    # add the costs for completely missing edges to the solution
                    vertex_costs[i, j] += missing_edges * (-missing_weight) * 0.5
                    solution_costs[i] += missing_edges * (-missing_weight) * 0.5 # two nodes within a cluster miss the same edge, hence *0.5 when summing per node
        return (vertex_costs, solution_costs)
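    # Worked example of the missing-edge accounting above (illustrative, assuming
    # missing_weight = -1): for a cluster {a, b, c} where the file contains only the edge
    # (a, b), the counters start at 2 per node; the present edge lowers a and b to 1 while
    # c stays at 2. The two absent pairs (a, c) and (b, c) then contribute
    # (1 + 1 + 2) * 1 * 0.5 = 2, i.e. each missing edge is counted once per endpoint and halved.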
    costs = calculate_costs(parents, x, False)
    vertex_costs = costs[0]
    solution_costs = costs[1]

    ### Solution Merge ###
    # Use the assessments/costs to merge and repair solutions sensibly
    mean_costs_c = np.zeros(37, dtype=np.float64)
    c_count = np.zeros(37, dtype= np.int64)
    # sum the costs of all solutions that share the same parameter
    for i in range(x):
        c = cluster_model[i]
        mean_costs_c[c] = mean_costs_c[c] + solution_costs[i]
        c_count[c] += 1
    # divide by the number of solutions with that parameter
    for i in range(37):
        mean_costs_c[i] = mean_costs_c[i]/c_count[i]
    # c_opt is the parameter whose solutions have the lowest average cost
    c_opt = np.argsort(mean_costs_c)[0]
    print_result(output_path, "c_opt_v4.txt", c_opt)
    generate_solutions(False, c_opt)
    costs = calculate_costs(parents, x, False)
    vertex_costs = costs[0]
    solution_costs = costs[1]
    # Optimization: filter the "best" solutions to create a more solid basis for the merge.
    top_percent = range(np.int64(x*merge_filter))
    mid_percent = range(np.int64(x*repair_filter))
    cost_sorted_i = np.argsort(solution_costs)
    good_costs_i = cost_sorted_i[top_percent]
    mid_costs_i = cost_sorted_i[mid_percent]
    # Artifact from the time when n_merges > 1 was allowed; otherwise incompatible with calculate_costs.
    merged_solutions = np.full((1,n), np.arange(n, dtype=np.int64))
    final_solutions = np.full((1,n), np.arange(n, dtype=np.int64))
    merged_sizes = np.full((1,n), np.zeros(n, dtype=np.int64))
    merged = merged_solution_scan(solution_costs[good_costs_i], vertex_costs[good_costs_i], parents[good_costs_i], sizes[good_costs_i], missing_weight, n, filename, output_path, union_threshold)
    merged_save = np.copy(merged[0])
    merged_solutions[0] = merged[0]
    merged_sizes[0] = merged[1]
    merged_c = calculate_costs(merged_solutions, 1, True)
    merged_costs = merged_c[1]
    merged_vc = merged_c[0]
    #merged_to_file(merged_solutions, merged_costs, filename, missing_weight, n, len(good_costs_i), 1)
    print_result(output_path, "merged_cost_v4.txt", merged_costs[0])
    print_zhk(output_path, merged_solutions[0], merged_sizes[0])
    # Flatten the solution in case a tree structure has appeared
    for j in range(0,n):
        flattening_find(j, merged_solutions[0])
    #rep = repair_merged(merged_solutions[i], merged_sizes[i], solution_costs, vertex_costs, parents, sizes, n, node_dgr)
    rep = repair_merged_v4_scan(merged_solutions[0], merged_sizes[0], solution_costs[mid_costs_i], vertex_costs[mid_costs_i], parents[mid_costs_i], sizes[mid_costs_i], n, node_dgr, big_border, filename)
    merged_solutions[0] = rep[0]
    merged_sizes[0] = rep[1]
    # To be safe, flatten once more before computing the solution costs:
    for j in range(0,n):
        flattening_find(j, merged_solutions[0])
    rep_c = calculate_costs(merged_solutions, 1, True)
    merged_costs = rep_c[1]
    rep_vc = rep_c[0]
    print_result(output_path, "rep_v4.txt", merged_costs[0])
    mr_3 = undo_merge_repair(merged_save, rep[0], merged_vc[0], rep_vc[0])
    final_solutions[0] = mr_3[0]
    merged_sizes[0] = mr_3[1]
    final_costs = calculate_costs(final_solutions, 1, True)
    print_result(output_path, "final_v4.txt", final_costs[1][0])
    # Since merge-repair is based on fewer solutions, report only those:
    x2 = len(mid_costs_i)
    #merged_to_file(merged_solutions, merged_costs, filename, missing_weight, n, x2, 1)
    merged_to_file(final_solutions, final_costs[1], filename, missing_weight, n, x2, 1, output_path)
    #all_solutions(solution_costs[good_costs_i], parents[good_costs_i], filename, missing_weight, n)
    #print_solution_costs(solution_costs[good_costs_i], filename)
    #merged_short_print(merged_solutions, merged_costs, filename, missing_weight, n, x2, 1)
@njit
def check_if_flat(solution):
    for i in range(len(solution)):
        # check whether node i is a root or a first-level child
        if solution[i] != i and solution[solution[i]] != solution[i]:
            # if it is neither, the tree is not flat!
            return False
    return True
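
# A minimal usage sketch, not part of the original pipeline: the file name, output path,
# and parameter values below are hypothetical placeholders chosen for illustration only.
if __name__ == "__main__":
    unionfind_cluster_editing(
        filename="graph.txt",      # hypothetical edge list: <node 1> <node 2> <weight> per line
        output_path="results/",    # hypothetical directory/prefix for the *_v4.txt result files
        missing_weight=-1,         # weight for edges not present in the file (unweighted data)
        n=1000,                    # number of nodes, labelled 0 .. n-1
        x=500,                     # number of sampled solutions (300-1000 recommended)
    )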