from union_find import *
from math import log
import sys
import numpy as np
from numba import njit, jit
from numpy import random as rand
from model_sqrt import *
from merging_methods_v4_slim import *
import csv
"""
This module implements a cluster editing algorithm. It uses a semi-streaming approach and is therefore able to process files that would be too big for main memory. This is the classical version of uf_ce, there is a second approach available using rem's algorithm instead of union-find.
"""
# Input sollte aus je 3 mit Leerzeichen getrennten Einträgen pro Zeile bestehen:
# <Nummer Knoten 1> <Nummer Knoten 2> <Gewicht der Kante>
# Die Knotenbezeichnungen sind (von 0 bis n-1) numpy.int64, die Gewichte numpy.float64
# missing_weight: Gewicht für fehlende Kanten (die nicht in der Datei enthalten sind)
# n: Anzahl Objekte/Knoten
# x: Anzahl generierter Lösungen (mehr = besser, aber teurer in Speicher/Laufzeit)
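# A hypothetical input snippet for illustration only (the values are not from any real
# dataset): three unweighted edges on the nodes 0..3, with missing_weight = -1 supplying
# the weight of every node pair that does not appear in the file.
#
#   0 1 1.0
#   0 2 1.0
#   2 3 1.0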
def unionfind_cluster_editing(filename, output_path, missing_weight, n, x):
    """
    Cluster editing algorithm based on a semi-streaming approach that uses union-find to analyze the graph structure.
    The input file should contain one edge per line in the format
    <np.int64: node 1> <np.int64: node 2> <np.float64: edge weight>\n
    Parameter missing_weight sets the weight for edges that are not contained in the file (for unweighted data: -1).
    Parameter n gives the number of objects (nodes).
    Parameter x is the number of generated solutions (which form the basis for the merged solution). Larger x improves the result but increases memory use and running time, so with limited memory it should not be chosen too high. 300-1000 is recommended; the more the better.
    A minimal usage sketch can be found at the bottom of this module.
    """
    # Heuristic parameters: merge_filter and repair_filter select the fraction of best
    # solutions used for merging and repairing; union_threshold and big_border are passed
    # on to the merge and repair routines.
    merge_filter = 0.1
    repair_filter = 0.9
    union_threshold = 0.05
    big_border = 0.3
    graph_file = open(filename, mode="r")

    ### Preprocessing ###
    # Compute the degree of every node (scan over all edges)
    node_dgr = np.zeros(n, dtype=np.int64)
    for line in graph_file:
        # skip comment lines
        if line[0] == "#":
            continue
        splitted = line.split()
        nodes = np.array(splitted[:-1], dtype=np.int64)
        weight = np.float64(splitted[2])
        # if the edge 'exists' according to the threshold:
        if weight > 0:
            node_dgr[nodes[0]] += 1
            node_dgr[nodes[1]] += 1

    # Sequentially for all solutions (all UF structures at once, or at least as many as fit):
    # size of one solution: array with n entries of 64 bit each
    ### Generate Solutions ###
    parents = np.full((x,n), np.arange(n, dtype=np.int64))
    sizes = np.ones((x,n), dtype=np.int64)
    # Load the model parameters:
    parameters_b = load_model_flexible_v2('params_below_100.csv')
    parameters_a = load_model_flexible_v2('params_above_100.csv')
    #cluster_count = np.full(x, n, dtype=np.int64)
    # Fix all parameters for the models:
    cluster_model = np.full(x,17)
    def generate_solutions(first, c_opt):
        if first:
            # distribute the 37 cluster-model parameters evenly over the x solutions
            k = int(x/37)
            j = 0
            c = 0
            for i in range(0,x):
                cluster_model[i] = c
                j += 1
                if j == k and c < 36:
                    c += 1
                    j = 0
        if not first:
            # overwrite solutions with a non-optimal parameter, so that new ones can be generated afterwards
            for i in range(0,x):
                if cluster_model[i] != c_opt:
                    parents[i] = np.arange(n, dtype=np.int64)
                    sizes[i] = np.ones(n, dtype = np.int64)
        # 2nd scan over all edges: sample every edge into the UF structures
        graph_file = open(filename, mode="r")
        for line in graph_file:
            # skip comment lines
            if line[0] == "#":
                continue
            splitted = line.split()
            nodes = np.array(splitted[:-1], dtype=np.int64)
            weight = np.float64(splitted[2])
            guess_n = (node_dgr[nodes[0]] + node_dgr[nodes[1]]) / 2
            decision_values = rand.rand(x)
            for i in range(0, x):
                if not first:
                    if cluster_model[i] == c_opt:
                        # in the 2nd run, do not change the solutions that are already good!
                        continue
                # determine the sampling rate
                sampling_rate = model_flexible_v2(parameters_b, parameters_a, guess_n, cluster_model[i])
                # if the edge is sampled...
                if decision_values[i] < sampling_rate:
                    # ...insert it into the UF structure
                    union(nodes[0], nodes[1], parents[i], sizes[i])
    generate_solutions(True, 0)
    ### Solution Assessment ###
    # Post-processing of all solutions: flatten the structure (= nodes in the same cluster share the same array entry)
    # and compute the number of required edges per cluster (n_c * (n_c-1) / 2) per UF structure
    def calculate_costs(solutions_parents, x, merged):
        if merged:
            inner_sizes = merged_sizes
        else:
            inner_sizes = sizes
        solution_costs = np.zeros(x, dtype=np.float64)
        vertex_costs = np.zeros((x,n), dtype=np.float64)
        c_edge_counter = np.zeros((x,n), dtype=np.int64)
        for i in range(x):
            for j in range(n):
                root = flattening_find(j,solutions_parents[i])
                n_c = inner_sizes[i, root]
                # each node in a cluster of size n_c needs n_c - 1 incident edges within the cluster
                c_edge_counter[i, j] = n_c - 1
        # 3rd scan over all edges: cost computation for all solutions (total costs and per-cluster costs)
        graph_file = open(filename, mode="r")
        for line in graph_file:
            # skip comment lines
            if line[0] == "#":
                continue
            splitted = line.split()
            nodes = np.array(splitted[:-1], dtype=np.int64)
            weight = np.float64(splitted[2])
            for i in range(0,x):
                if not merged:
                    root1 = find(nodes[0],solutions_parents[i])
                    root2 = find(nodes[1],solutions_parents[i])
                else:
                    root1 = solutions_parents[i, nodes[0]]
                    root2 = solutions_parents[i, nodes[1]]
                # edge between two clusters
                if root1 != root2:
                    # with positive weight (one edge too many)
                    if weight > 0:
                        vertex_costs[i, nodes[0]] += weight / 2
                        vertex_costs[i, nodes[1]] += weight / 2
                        solution_costs[i] += weight
                # edge within a cluster
                else:
                    # with negative weight (the edge is missing)
                    if weight < 0:
                        vertex_costs[i, nodes[0]] -= weight / 2
                        vertex_costs[i, nodes[1]] -= weight / 2
                        solution_costs[i] -= weight
                    c_edge_counter[i, nodes[0]] -= 1
                    c_edge_counter[i, nodes[1]] -= 1
                    #print("missing edges for now: ", c_edge_counter[i][root1])
        for i in range(0,x):
            # iterate over all nodes:
            for j in range(n):
                missing_edges = c_edge_counter[i, j]
                if missing_edges > 0:
                    # add the costs for completely missing edges to the solution
                    vertex_costs[i, j] += missing_edges * (-missing_weight) * 0.5
                    solution_costs[i] += missing_edges * (-missing_weight) * 0.5 # two nodes within a cluster miss the same edge, hence *0.5 when summing per node
        return (vertex_costs, solution_costs)
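    # Worked example of the missing-edge accounting above (illustrative, assuming
    # missing_weight = -1): for a cluster {a, b, c} where the file contains only the edge
    # (a, b), the counters start at 2 per node; the present edge lowers a and b to 1 while
    # c stays at 2. The two absent pairs (a, c) and (b, c) then contribute
    # (1 + 1 + 2) * 1 * 0.5 = 2, i.e. each missing edge is counted once per endpoint and halved.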
    costs = calculate_costs(parents, x, False)
    vertex_costs = costs[0]
    solution_costs = costs[1]

    ### Solution Merge ###
    # Use the assessments/costs to merge and repair solutions sensibly
    mean_costs_c = np.zeros(37, dtype=np.float64)
    c_count = np.zeros(37, dtype= np.int64)
    # sum the costs of all solutions that share the same parameter
    for i in range(x):
        c = cluster_model[i]
        mean_costs_c[c] = mean_costs_c[c] + solution_costs[i]
        c_count[c] += 1
    # divide by the number of solutions with that parameter
    for i in range(37):
        mean_costs_c[i] = mean_costs_c[i]/c_count[i]
    # c_opt is the parameter whose solutions have the lowest average cost
    c_opt = np.argsort(mean_costs_c)[0]
    print_result(output_path, "c_opt_v4.txt", c_opt)
    generate_solutions(False, c_opt)
    costs = calculate_costs(parents, x, False)
    vertex_costs = costs[0]
    solution_costs = costs[1]
    # Optimization: filter the "best" solutions to create a more solid basis for the merge.
    top_percent = range(np.int64(x*merge_filter))
    mid_percent = range(np.int64(x*repair_filter))
    cost_sorted_i = np.argsort(solution_costs)
    good_costs_i = cost_sorted_i[top_percent]
    mid_costs_i = cost_sorted_i[mid_percent]
    # Artifact from the time when n_merges > 1 was allowed; otherwise incompatible with calculate_costs.
    merged_solutions = np.full((1,n), np.arange(n, dtype=np.int64))
    final_solutions = np.full((1,n), np.arange(n, dtype=np.int64))
    merged_sizes = np.full((1,n), np.zeros(n, dtype=np.int64))
    merged = merged_solution_scan(solution_costs[good_costs_i], vertex_costs[good_costs_i], parents[good_costs_i], sizes[good_costs_i], missing_weight, n, filename, output_path, union_threshold)
    merged_save = np.copy(merged[0])
    merged_solutions[0] = merged[0]
    merged_sizes[0] = merged[1]
    merged_c = calculate_costs(merged_solutions, 1, True)
    merged_costs = merged_c[1]
    merged_vc = merged_c[0]
    #merged_to_file(merged_solutions, merged_costs, filename, missing_weight, n, len(good_costs_i), 1)
    print_result(output_path, "merged_cost_v4.txt", merged_costs[0])
    print_zhk(output_path, merged_solutions[0], merged_sizes[0])
    # Flatten the solution in case a tree structure has appeared
    for j in range(0,n):
        flattening_find(j, merged_solutions[0])
    #rep = repair_merged(merged_solutions[i], merged_sizes[i], solution_costs, vertex_costs, parents, sizes, n, node_dgr)
    rep = repair_merged_v4_scan(merged_solutions[0], merged_sizes[0], solution_costs[mid_costs_i], vertex_costs[mid_costs_i], parents[mid_costs_i], sizes[mid_costs_i], n, node_dgr, big_border, filename)
    merged_solutions[0] = rep[0]
    merged_sizes[0] = rep[1]
    # To be safe, flatten once more before computing the solution costs:
    for j in range(0,n):
        flattening_find(j, merged_solutions[0])
    rep_c = calculate_costs(merged_solutions, 1, True)
    merged_costs = rep_c[1]
    rep_vc = rep_c[0]
    print_result(output_path, "rep_v4.txt", merged_costs[0])
    mr_3 = undo_merge_repair(merged_save, rep[0], merged_vc[0], rep_vc[0])
    final_solutions[0] = mr_3[0]
    merged_sizes[0] = mr_3[1]
    final_costs = calculate_costs(final_solutions, 1, True)
    print_result(output_path, "final_v4.txt", final_costs[1][0])
    # Since merge-repair is based on fewer solutions, report only those:
    x2 = len(mid_costs_i)
    #merged_to_file(merged_solutions, merged_costs, filename, missing_weight, n, x2, 1)
    merged_to_file(final_solutions, final_costs[1], filename, missing_weight, n, x2, 1, output_path)
    #all_solutions(solution_costs[good_costs_i], parents[good_costs_i], filename, missing_weight, n)
    #print_solution_costs(solution_costs[good_costs_i], filename)
    #merged_short_print(merged_solutions, merged_costs, filename, missing_weight, n, x2, 1)
@njit
def check_if_flat(solution):
    for i in range(len(solution)):
        # check whether node i is a root or a first-level child
        if solution[i] != i and solution[solution[i]] != solution[i]:
            # if it is neither, the tree is not flat!
            return False
    return True
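
# A minimal usage sketch, not part of the original pipeline: the file name, output path,
# and parameter values below are hypothetical placeholders chosen for illustration only.
if __name__ == "__main__":
    unionfind_cluster_editing(
        filename="graph.txt",      # hypothetical edge list: <node 1> <node 2> <weight> per line
        output_path="results/",    # hypothetical directory/prefix for the *_v4.txt result files
        missing_weight=-1,         # weight for edges not present in the file (unweighted data)
        n=1000,                    # number of nodes, labelled 0 .. n-1
        x=500,                     # number of sampled solutions (300-1000 recommended)
    )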