def evaluate_cluster(data_path, model_path, real_path, pred_path):
    """Cluster a CSV dataset with a saved classifier and print its scores.

    The rows loaded from ``data_path`` are grouped by
    ``cluster_by_classifier`` using the model loaded from ``model_path``.
    The resulting clusters are written to ``pred_path`` and then scored by
    ``val`` against ``real_path``. The three values returned by ``val`` are
    printed as precision, recall and f1, one per line. Nothing is returned.
    """
    records = Dataset.from_csv(data_path).rows
    classifier = load_model(model_path)
    write_clusters(cluster_by_classifier(records, classifier), pred_path)

    # ``val`` reads the predictions back from disk, so it must run only
    # after ``write_clusters`` has finished.
    precision, recall, f1 = val(pred_path, real_path)
    print(
        f'precision: {precision}',
        f'recall: {recall}',
        f'f1: {f1}',
        sep='\n',
    )
def _prepare(record_filename, data_filename):
    """Write randomly sampled, labelled record pairs to ``data_filename``.

    Records are loaded from ``record_filename`` and taken grouped from
    ``Dataset.rows_oc``. Every output line holds the ``ruid`` of both
    records, a label (1 when both records were drawn from the same group,
    0 when they were drawn from two different groups) and the
    space-separated values returned by ``get_similarity``.

    The number of sampled pairs and the probability of drawing a positive
    pair are controlled by the module-level ``positive_rate`` and
    ``data_rate``. Progress and a final tally are printed to stdout.
    """
    clusters = list(Dataset.from_csv(record_filename).rows_oc.values())

    # A group of size k contains k * (k - 1) / 2 distinct same-group pairs.
    same_group_pairs = sum(len(v) * (len(v) - 1) / 2 for v in clusters)
    sample_size = int(same_group_pairs / positive_rate * data_rate)
    last_index = len(clusters) - 1
    n_positive = n_negative = 0

    with open(data_filename, 'w') as sink:
        for i in range(sample_size):
            if not i % 1000:
                print(f'[{data_filename}] Process: {i}')

            label = 1 if random.random() < positive_rate else 0
            if label:
                # Redraw until the chosen group can supply two records.
                chosen = clusters[random.randint(0, last_index)]
                while len(chosen) <= 1:
                    chosen = clusters[random.randint(0, last_index)]
                items = random.sample(chosen, 2)
                assert items[0].cuid == items[1].cuid
                n_positive += 1
            else:
                # Two distinct groups, then one random record from each.
                group_pair = random.sample(clusters, 2)
                items = [
                    x[random.randint(0, len(x) - 1)] for x in group_pair
                ]
                assert items[0].cuid != items[1].cuid
                n_negative += 1

            feature = get_similarity(*items)
            sink.write(
                f'{items[0].ruid} {items[1].ruid} {label} {" ".join(map(str, feature))}\n'
            )

    print(
        f'[{data_filename}] positive: {n_positive}, negative: {n_negative}, total: {n_positive + n_negative}'
    )
def fit_linear(dataset: Dataset, lambdaa=0.1, max_iter=10000, threshold=0.11) -> Tuple[Vector, float, int]:
    """Fit a coefficient vector to ``dataset`` by repeated gradient steps.

    Starting from the vector ``(0, 1, ..., n - 1)``, with one component per
    entry of ``dataset.features``, the update
    ``rho = rho - lambdaa * gradient(dataset, rho)`` is applied until the
    value of ``sqr_error`` changes by at most ``threshold`` between two
    consecutive iterations, or until ``max_iter`` updates have been made.

    Args:
        dataset: Data to fit; passed unchanged to ``sqr_error`` and
            ``gradient``.
        lambdaa: Factor the gradient is multiplied by at each step.
        max_iter: Maximum number of update steps.
        threshold: Stop once the absolute change in error between two
            consecutive iterations is no greater than this value.

    Returns:
        A tuple ``(rho, error, iter_count)``: the final vector, the value of
        ``sqr_error`` for it, and the number of updates performed.
    """
    rho = Vector(*range(len(dataset.features)))
    error = sqr_error(dataset, rho)
    # ``None`` means "no previous error recorded yet". Using 0 as the
    # sentinel would be indistinguishable from a genuine previous error of
    # exactly 0, which kept the loop running until ``max_iter``.
    prev_error = None
    iter_count = 0
    while iter_count < max_iter and (
        prev_error is None or abs(prev_error - error) > threshold
    ):
        rho = rho - lambdaa * gradient(dataset, rho)
        prev_error, error = error, sqr_error(dataset, rho)
        iter_count += 1
    return rho, error, iter_count


if __name__ == "__main__":
    # Small demo: a header row followed by four comma-separated data rows,
    # joined with explicit newlines so each record sits on its own CSV line.
    csv_str = (
        "V_lead,V_iron,V_aluminium,mass\n"
        "0.3,0.2,0.1,5.246\n"
        "0.1,0.1,0.4,3.001\n"
        "0.7,0.3,0.5,11.649\n"
        "0.4,0.6,0.11,9.5574"
    )
    rho, sqr_err, nb_iter = fit_linear(Dataset.from_csv(csv_str))
    print("Densities:", rho)
    print("Nb iterations:", nb_iter)
    print("Square error:", sqr_err)