import random
from typing import Tuple

# Dataset, Vector, load_model, cluster_by_classifier, write_clusters, val,
# get_similarity, sqr_error and gradient are assumed to come from the
# surrounding project.


def evaluate_cluster(data_path, model_path, real_path, pred_path):
    """Cluster the rows of data_path with the classifier stored at
    model_path, write the predicted clusters to pred_path, and print
    precision/recall/F1 against the gold clustering in real_path."""
    dataset = Dataset.from_csv(data_path)
    rows = dataset.rows
    clf = load_model(model_path)
    clusters = cluster_by_classifier(rows, clf)

    write_clusters(clusters, pred_path)
    precision, recall, f1 = val(pred_path, real_path)
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'f1: {f1}')
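

# A minimal sketch of the kind of pairwise scoring val could perform.
# Everything here is an assumption for illustration: the real val lives
# elsewhere in the project, and this sketch presumes each file holds one
# whitespace-separated '<ruid> <cuid>' pair per line.
def _pairwise_val_sketch(pred_path, real_path):
    def pair_set(path):
        by_cluster = {}
        with open(path) as f:
            for line in f:
                ruid, cuid = line.split()
                by_cluster.setdefault(cuid, []).append(ruid)
        # All unordered record pairs that share a cluster.
        return {
            frozenset((a, b))
            for members in by_cluster.values()
            for i, a in enumerate(members)
            for b in members[i + 1:]
        }

    pred, real = pair_set(pred_path), pair_set(real_path)
    hits = len(pred & real)
    precision = hits / len(pred) if pred else 0.0
    recall = hits / len(real) if real else 0.0
    f1 = 2 * precision * recall / (precision + recall) if hits else 0.0
    return precision, recall, f1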


def _prepare(record_filename, data_filename,
             positive_rate=0.5, data_rate=1.0):
    """Sample labelled record pairs from the gold clusters in
    record_filename and write one '<ruid> <ruid> <label> <features>' line
    per pair to data_filename.

    positive_rate and data_rate were free names in the original; they are
    surfaced here as parameters with assumed defaults.
    """
    dataset = Dataset.from_csv(record_filename)
    clusters = list(dataset.rows_oc.values())
    # Every unordered pair inside a cluster is a potential positive example.
    all_positive_count = sum(len(v) * (len(v) - 1) // 2 for v in clusters)
    all_count = int(all_positive_count / positive_rate * data_rate)
    n_entities = len(clusters)
    n_positive = n_negative = 0

    with open(data_filename, 'w') as data_out:
        for i in range(all_count):
            if i % 1000 == 0:
                print(f'[{data_filename}] Progress: {i}')
            if random.random() < positive_rate:
                # Positive pair: two records drawn from the same cluster.
                label = 1
                while True:
                    e = random.randint(0, n_entities - 1)
                    if len(clusters[e]) > 1:
                        break
                items = random.sample(clusters[e], 2)
                assert items[0].cuid == items[1].cuid
                n_positive += 1
            else:
                # Negative pair: one record from each of two distinct clusters.
                label = 0
                items = [random.choice(x) for x in random.sample(clusters, 2)]
                assert items[0].cuid != items[1].cuid
                n_negative += 1
            feature = get_similarity(*items)
            data_out.write(
                f'{items[0].ruid} {items[1].ruid} {label} {" ".join(map(str, feature))}\n'
            )
    print(
        f'[{data_filename}] positive: {n_positive}, negative: {n_negative}, total: {n_positive + n_negative}'
    )
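
# Example use (hypothetical file names):
#   _prepare('gold_records.csv', 'train_pairs.txt')
#   _prepare('gold_records.csv', 'dev_pairs.txt', positive_rate=0.3)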

def fit_linear(dataset: Dataset,
               lambdaa=0.1,
               max_iter=10000,
               threshold=0.11) -> Tuple[Vector, float, int]:
    """Fit a linear model to dataset by batch gradient descent.

    lambdaa is the learning rate; iteration stops after max_iter steps or
    once the squared error changes by less than threshold between steps.
    Returns the fitted coefficient vector, its squared error and the
    number of iterations performed.
    """
    # Arbitrary starting point: (0, 1, ..., n - 1), one coefficient per feature.
    rho = Vector(*range(len(dataset.features)))
    error = sqr_error(dataset, rho)
    prev_error = 0
    iter_count = 0
    while iter_count < max_iter and (prev_error == 0
                                     or abs(prev_error - error) > threshold):
        # Step against the gradient of the squared error.
        rho = rho - lambdaa * gradient(dataset, rho)
        prev_error, error = error, sqr_error(dataset, rho)
        iter_count += 1
    return rho, error, iter_count
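

# Minimal sketches of the two helpers fit_linear assumes; the real
# sqr_error and gradient are defined elsewhere in the project. The row
# API used here (row.features, row.target, an indexable Vector with a
# dot product) is a guess for illustration only.
def _sqr_error_sketch(dataset, rho):
    # Sum of squared residuals of the linear model rho over all rows.
    return sum((row.features.dot(rho) - row.target) ** 2
               for row in dataset.rows)


def _gradient_sketch(dataset, rho):
    # Partial derivative w.r.t. rho_j: 2 * sum(residual * x_j) over rows.
    residuals = [(row.features.dot(rho) - row.target, row.features)
                 for row in dataset.rows]
    return Vector(*(sum(2 * r * x[j] for r, x in residuals)
                    for j in range(len(dataset.features))))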


if __name__ == "__main__":
    csv_str = """V_lead,V_iron,V_aluminium,mass
0.3,0.2,0.1,5.246
0.1,0.1,0.4,3.001
0.7,0.3,0.5,11.649
0.4,0.6,0.11,9.5574"""

    # Recover the per-material densities that explain the measured masses.
    rho, sqr_err, nb_iter = fit_linear(Dataset.from_csv(csv_str))

    print("Densities:", rho)
    print("Nb iterations:", nb_iter)
    print("Square error:", sqr_err)