def get_jaccard_distribution(pf, args, start_t):
    """Sample random pairs and save their similarities to CSV.

    Two independent random permutations of ``pf.n_docs`` are zipped
    together to form the pairs. For every pair whose two members differ,
    the Jaccard similarity and the signature similarity are written as one
    CSV row. The result is used by jaccard_distribution.R to fit a
    distribution to the data.

    Args:
        pf: Object providing ``n_docs``, ``jaccard_similarity(u1, u2)`` and
            ``sig_sim(u1, u2)``.
        args: Object whose ``results`` attribute is handed to ``CsvWriter``
            as the output target (opened with ``append=True``).
        start_t: Not used by this function.

    Returns:
        True after all pairs have been processed.
    """
    csv = CsvWriter(args.results, append=True)
    csv.write_header(['u1', 'u2', 'jac_sim', 'sig_sim'])
    pairs = zip(np.random.permutation(pf.n_docs),
                np.random.permutation(pf.n_docs))
    for i, (u1, u2) in enumerate(pairs):
        if u1 == u2:
            # A bare `next` only names the builtin and does nothing, so
            # self-pairs used to be written; `continue` skips them.
            continue
        # `i` is the index of the pair being examined (skipped pairs are
        # counted too); '\r' keeps the progress output on a single line.
        print("Wrote {} similarities".format(i), end='\r')
        csv.write([u1, u2, pf.jaccard_similarity(u1, u2), pf.sig_sim(u1, u2)])
    return True
def get_candidate_distribution(pf, args, start_t):
    """Sample candidate pairs, stratified by signature similarity.

    The candidates from ``pf.candidates()`` (items of the form
    ``((c1, c2), sim)``) are bucketed into strata of width 0.05 by their
    signature similarity. From each stratum at most ``max_per_step`` pairs
    are drawn at random and written to the CSV together with the signature
    similarity, the Jaccard similarity, and a weight. The weight is > 1
    when the stratum held more candidates than ``max_per_step``, so that
    approximately accurate frequencies can be recovered.

    Args:
        pf: Object providing ``candidates()``, ``jaccard_similarity(a, b)``,
            ``sig_len``, ``n_bands``, ``max_buckets`` and ``buckets``.
        args: Object whose ``results`` attribute is handed to ``CsvWriter``
            as the output target (opened with ``append=True``).
        start_t: Not used by this function.

    Returns:
        True after all strata have been processed.
    """
    all_candidates = list(pf.candidates())
    writer = CsvWriter(args.results, append=True)
    writer.write_header([
        'run_id', 'u1', 'u2', 'sig_sim', 'jac_sim', 'sig_len', 'bands',
        'max_buckets', 'used_buckets', 'weight'
    ])
    # One timestamp identifies every row written by this call.
    run_id = datetime.now().isoformat()

    step = 0.05
    max_per_step = 100
    lower = 0
    while lower < 1:
        # The upper edge of this stratum becomes the lower edge of the next.
        upper = lower + step
        stratum = [(pair, sim) for pair, sim in all_candidates
                   if lower <= sim < upper]
        size = len(stratum)
        print("{} candidates between {} and {}".format(size, lower, upper))
        # 1.0 unless the stratum is larger than the per-stratum sample cap.
        weight = max(size, max_per_step) / max_per_step
        chosen = np.random.permutation(size)[:max_per_step]
        for idx in chosen:
            (c1, c2), sim = stratum[idx]
            writer.write([
                run_id, c1, c2, sim,
                pf.jaccard_similarity(c1, c2), pf.sig_len, pf.n_bands,
                pf.max_buckets,
                len(pf.buckets), weight
            ])
        lower = upper
    return True
import time import subprocess import os import gc import data from csv_writer import CsvWriter from util import ensure_directory """ This file simulates the evaluation environment and stores results as a CSV. """ csv = CsvWriter("diagnostics/out/evaluation.csv", append=True) csv.write_header([ 'note', 'batch', 'run', 'found', 'incorrect', 'time', 'ppm', 'terminated' ]) def jaccard_sim(data, u1, u2): m1 = data.movie[data.user == u1] m2 = data.movie[data.user == u2] return len(np.intersect1d(m1, m2)) / len(np.union1d(m1, m2)) def run_evaluation(note, batch=0, runs=5): for run in range(runs): cmd = [ "python3", "main.py", "--rows",
import numpy as np import multiprocessing as mp from sklearn.model_selection import ParameterGrid from datetime import datetime import time import subprocess import os from csv_writer import CsvWriter from util import ensure_directory csv = CsvWriter("diagnostics/out/experiments.csv", append=True) csv.write_header( ['batch_id', 'run_id', 'bands', 'rows', 'max_buckets', 'time', 'count']) batch_id = datetime.now().isoformat() def count_lines_in_file(filename): if not os.path.exists(filename): return 0 count = 0 with open(filename) as f: for l in f: if l.strip() != '': count += 1 return count def perform(params):