def get_model_data(model_type, model_name, compression_options=None):
    """Load (and memoize) the benchmark data for one model.

    Resolution order: in-memory ``data_cache`` -> gzipped jsonl resource
    bundled in the package -> local jsonl file under the shrynk path ->
    empty list when neither exists.

    Args:
        model_type: kind of model (used as the first half of the cache key
            and of the data file name).
        model_name: name of the model (second half of key/file name).
        compression_options: optional list of kwargs-dicts; when given,
            each record's "bench" entries are filtered down to only those
            whose serialized kwargs appear in this list.

    Returns:
        list of dicts (benchmark records), possibly empty.
    """
    from preconvert.output import json

    cache_key = model_type + "_" + model_name
    # BUGFIX: the original checked `model_name in data_cache` but always
    # stored under the combined "<type>_<name>" key, so the cache never hit
    # (and would have raised KeyError if a bare model_name were ever a key).
    if cache_key in data_cache:
        return data_cache[cache_key]
    try:
        # NOTE(review): pkgutil.get_data returns None (it does not raise)
        # when the resource is missing on some loaders; in that case
        # decompress(None) raises TypeError, not FileNotFoundError — verify
        # against the package layout before relying on the fallback chain.
        data = pkgutil.get_data(
            "data", "shrynk/{}_{}.jsonl.gzip".format(model_type, model_name)
        )
        data = [
            json.loads(line)
            for line in decompress(data).decode("utf8").split("\n")
            if line.strip()
        ]
    except FileNotFoundError:
        try:
            with open(shrynk_path("{}_{}.jsonl".format(model_type, model_name))) as f:
                data = [json.loads(x) for x in f.read().split("\n") if x]
        except FileNotFoundError:
            data = []
    if compression_options is not None:
        # Keep only benchmark entries for compressions the caller knows about.
        known_kwargs = {json.dumps(x) for x in compression_options}
        for x in data:
            x["bench"] = [y for y in x["bench"] if y["kwargs"] in known_kwargs]
    data_cache[cache_key] = data
    return data
def append(obj, fn):
    """Serialize ``obj`` to JSON and append it as a single line to path ``fn``.

    Raises:
        TypeError: when ``fn`` is not a filesystem path (string).
    """
    from preconvert.output import json

    if not isinstance(fn, str):
        raise TypeError("Cannot append to compression")
    line = json.dumps(obj) + "\n"
    with open(fn, "a+") as handle:
        handle.write(line)
def iwrite(obj, fn):
    """Write an iterable of objects to path ``fn``, one JSON document per line.

    Raises:
        TypeError: when ``fn`` is not a filesystem path (string).
    """
    from preconvert.output import json

    if not isinstance(fn, str):
        raise TypeError("Cannot iteratively write compressed")
    with open(fn, "w") as handle:
        # Stream each chunk out lazily instead of an explicit write-per-loop.
        handle.writelines(json.dumps(chunk) + "\n" for chunk in obj)
def write(obj, fn):
    """Write ``obj`` as JSON.

    When ``fn`` is a string it is treated as a path and the object is
    pretty-printed (indent=4) to that file; otherwise ``fn`` is assumed to
    be a binary file-like object and compact UTF-8 JSON bytes are written
    directly to it.
    """
    from preconvert.output import json

    if isinstance(fn, str):
        with open(fn, "w") as handle:
            json.dump(obj, handle, indent=4)
    else:
        # file-like object, presumably opened in binary mode
        fn.write(bytes(json.dumps(obj), encoding="utf8"))
def get_cache_file_name(domain, request_info, compression=".gz"):
    """Build the cache-file path for a request.

    The md5 of the JSON-serialized ``request_info`` determines the location:
    the first 3 hex chars form a shard directory, the remaining 29 the file
    stem. A falsy ``compression`` yields a bare ``.json`` suffix.

    Returns:
        str: ``~/.just_requests/<domain>/<shard>/<stem>.json<compression>``
    """
    from preconvert.output import json

    digest = hashlib.md5(json.dumps(request_info).encode("utf8")).hexdigest()
    shard, stem = digest[:3], digest[3:]
    suffix = compression if compression else ""
    return f"~/.just_requests/{domain}/{shard}/{stem}.json{suffix}"
def run_benchmarks(self, data_generator, ignore_seen=True, timeout=300, save=True, verbose=True):
    """Featurize and benchmark each dataset yielded by ``data_generator``.

    For every dataset that can be cast and featurized, runs
    ``self.benchmark`` over all compression options and (optionally)
    appends the result as a JSON line to the model's local data file.

    Args:
        data_generator: iterable of datasets; a single str/DataFrame/dict
            is wrapped in a list.
        ignore_seen: skip datasets whose feature hash is already known.
        timeout: passed through to ``self.benchmark``.
        save: when True, append each new result to the local jsonl file.
        verbose: passed through to ``self.benchmark``.

    Returns:
        list of result dicts (one per newly benchmarked dataset).
    """
    from preconvert.output import json
    if self.model_data is None:
        # Lazily load existing benchmark records so duplicates can be skipped.
        self.model_data = get_model_data(self.model_type, self.model_name, self.compression_options)
    feature_ids = set([x["feature_id"] for x in self.model_data])
    results = []
    index = []  # NOTE(review): never used below — dead variable?
    if isinstance(data_generator, (str, pd.DataFrame, dict)):
        # Allow passing a single dataset instead of an iterable.
        data_generator = [data_generator]
    for num, df in enumerate(data_generator):
        df, status = self.cast_to_data(df)
        if df is None:
            print(status)
            continue
        stat_computation_time = time.time()
        try:
            features = self.get_features(df)
        except self.bench_exceptions:
            continue
        if features is None:
            continue
        feature_id = md5(features)
        if ignore_seen and feature_id in feature_ids:
            print("seen", feature_id)
            continue
        # Reuse the variable: from timestamp to elapsed featurization time.
        stat_computation_time = time.time() - stat_computation_time
        result = {
            "feature_id": feature_id,
            "features": features,
            "class": self.__class__.__name__,
            "stat_computation_time": stat_computation_time,
        }
        bench = self.benchmark(df, timeout, verbose=verbose)
        result["bench"] = bench
        if bench:
            model_data_path = shrynk_path("{}_{}.jsonl".format(
                self.model_type, self.model_name))
            self.model_data.append(result)
            if save:
                # Append-only jsonl: one benchmark record per line.
                with open(model_data_path, "a") as f:
                    f.write(json.dumps(result) + "\n")
            results.append(result)
            feature_ids.add(feature_id)
    ### run benchmarks should return a total overview or something
    # return pd.DataFrame(bench).set_index("kwargs")
    return results
def benchmark(self, df, timeout=300, verbose=False):
    """Run every configured compression option against ``df``.

    Options whose single benchmark fails (size comes back as None) are
    skipped with a console notice.

    Returns:
        list of dicts with JSON-serialized "kwargs", "size", "write_time"
        and "read_time" for each successful option.
    """
    from preconvert.output import json

    measurements = []
    for option in self.compression_options:
        size, write_time, read_time = self.single_benchmark(
            df, option, timeout)
        if size is None:
            # This compression could not be benchmarked; leave it out.
            print("error, skipping", option)
            continue
        if verbose:
            print(option, size, write_time, read_time)
        record = {
            "kwargs": json.dumps(option),
            "size": size,
            "write_time": write_time,
            "read_time": read_time,
        }
        measurements.append(record)
    return measurements
def md5(features):
    """Return a stable md5 hex digest for ``features``.

    Keys are sorted before serialization so that dicts with identical
    content always produce the same digest.
    """
    from preconvert.output import json

    canonical = json.dumps(features, sort_keys=True)
    return hashlib.md5(canonical.encode()).hexdigest()
def get_benchmark_html(df, fname):
    """Render the benchmark-results HTML fragment for an uploaded dataset.

    Featurizes ``df``, obtains (or, in production, loads cached) benchmark
    results, ranks the compressions by the weighted z-score, compares the
    model's prediction against the ground truth, optionally persists the
    outcome to a blob, and returns the rendered HTML as a string.

    NOTE(review): relies on module globals not visible in this chunk:
    ``pdc``, ``IN_PRODUCTION``, ``get_blob``, ``weights``, ``scale``,
    ``request``, ``Markup``, ``replacenth``, ``format_res``, ``deepcopy``,
    ``json``, ``pd``, ``datetime`` — confirm against the full file.
    """
    features = pdc.get_features(df)
    bench_res = None
    save = False
    if IN_PRODUCTION:
        blob = get_blob(features)
        if blob.exists():
            # Cached benchmark exists for these features: reuse it.
            results = json.loads(blob.download_as_string())
            bench_res = results["bench"]
        else:
            results = pdc.run_benchmarks(df, save=False, ignore_seen=False, timeout=False)[0]
            # make a copy not to pop kwargs from results object which will be saved
            bench_res = deepcopy(results)["bench"]
            save = True
    else:
        bench_res = pdc.run_benchmarks(df, save=False, ignore_seen=False, timeout=False)[0]["bench"]
    # Pop the serialized kwargs out of each record to use as the frame index.
    kwargs = [x.pop("kwargs") for x in bench_res]
    bench_res = pd.DataFrame(bench_res, index=kwargs)
    inferred = pdc.infer(features)
    z_name = "z {}".format(tuple(weights))
    # Weighted sum of scaled metrics = overall score; lower sorts first.
    bench_res[z_name] = (scale(bench_res) * weights).sum(axis=1)
    bench_res = bench_res.round(5).sort_values(z_name)
    bench_res = bench_res[[z_name, "size", "write_time", "read_time"]]
    y = json.dumps(inferred)
    # 1-based rank of the predicted compression in the ground truth; -1 sentinel
    # when the prediction does not appear at all.
    res_index = [i + 1 for i, x in enumerate(bench_res.index) if x == y] + [-1]
    if save:
        ip = request.environ.get("HTTP_X_FORWARDED_FOR", "")
        # First address in the forwarded chain is the original client.
        ip = ip.split(",")[0]
        results["web"] = {
            "utctime": datetime.utcnow().isoformat(),
            "ip": ip,
            "predicted": inferred,
            "res_index": res_index[0],  # 1 is 1st, 2 is 2nd
            "filename": fname,
            "weights": weights.tolist(),
        }
        blob.upload_from_string(json.dumps(results))
        print("saved blob")
    # Humanize the index: '{"engine": ...}' -> 'engine=... key=...'.
    bench_res.index = [
        " ".join(["{}={!r}".format(k, v) for k, v in json.loads(x).items()])
        for x in bench_res.index
    ]
    # Hide the "Wrong!" banner when the prediction ranked first.
    learning = "none" if res_index and res_index[0] == 1 else "inherit"
    nth = {
        1: "1st",
        2: "2nd",
        3: "3rd",
        -1: "999"
    }.get(res_index[0], str(res_index[0]) + "th")
    # upload(features, "{}-{}".format(file.filename, time.time()))
    features = {
        k.replace("quantile_proportion", "quantile"): round(v, 3) if isinstance(v, float) else v
        for k, v in features.items()
    }
    return str(
        Markup(
            '<center> <h5 class="tagline"> Results: </h5></center>'
            + '<div class="container" style="margin-top: 2rem"><div class="row">'
            + '<div class="col l10 offset-l2" style="padding-bottom: 2rem; padding-top: 1rem;">The data was featurized, and a prediction was made. Then, all the compressions were ran for this file so we can see if the prediction was correct (the ground truth).</div>'
            + '<div class="col s12 m6 l3 offset-l2">'
            + "<b>Filename: </b>"
            + fname
            + "<br><b>Features: </b>"
            + '<code class="codes">'
            + json.dumps(features, indent=4)
            + "</code>"
            + '</div>'
            + '<div class="col s12 m6 l3 offset-l3">'
            + "<br><center style='line-height: 3'><b>Predicted: </b><br>"
            # just using features here instead of data to be faster
            + " ".join(["{}={!r}".format(k, v) for k, v in inferred.items()])
            + "<br><b>Result:</b><br><span class='result {}'>{}</span> / {}<br><div style='display: {}'><span style='color: #ee6e73'>Wrong!</span> We will learn from this...</div>"
            .format(nth[-2:], nth, bench_res.shape[0], learning)
            + "</center></div></div>"
            + "<center><h4>Ground truth</h4><div class='show-on-small hide-on-med-and-up' style='padding: 0.5rem; color: grey'> -- scroll -> </center>"
            + replacenth(
                format_res(bench_res, tuple(weights), fname),
                "<tr ",
                '<tr class="resultinv {}" '.format(nth[-2:]),
                int(nth[:-2]),
            )))
def run_benchmarks(self, data_generator, save=True, ignore_seen=True, timeout=300):
    """Featurize and benchmark each dataset from ``data_generator``,
    appending new results to ``~/shrynk_<model_name>.jsonl``.

    NOTE(review): this looks like an older/parallel variant of the other
    ``run_benchmarks`` in this file — it calls ``get_model_data`` with only
    two arguments while the definition here takes (model_type, model_name,
    compression_options), and calls ``self.benchmark(df, kwargs, timeout)``
    expecting a (size, write_time, read_time) tuple, unlike the
    ``benchmark`` method defined here. Confirm which class this belongs to
    before reconciling.

    Args:
        data_generator: iterable of file paths or DataFrames; a single
            DataFrame is wrapped in a list.
        save: when True, append each new result to the local jsonl file.
        ignore_seen: skip datasets whose feature hash is already known.
        timeout: passed through to the per-compression benchmark call.

    Returns:
        list of result dicts (one per newly benchmarked dataset).
    """
    from preconvert.output import json
    model_path = os.path.expanduser("~/shrynk_{}.jsonl".format(
        self.model_name))
    if self.model_data is None:
        self.model_data = get_model_data(self.model_name, self.compression_options)
    feature_ids = set([x["feature_id"] for x in self.model_data])
    results = []
    index = []  # NOTE(review): never used below — dead variable?
    if isinstance(data_generator, pd.DataFrame):
        data_generator = [data_generator]
    for num, df in enumerate(data_generator):
        if isinstance(df, str) and os.path.isfile(df):
            # String items are treated as file paths and loaded first.
            try:
                df = self.load(df)
            except self.bench_exceptions:
                continue
        if df is None:
            continue
        if isinstance(df, str):
            # Still a string after the load attempt: not a usable dataset.
            continue
        stat_computation_time = time.time()
        features = self.get_features(df)
        if features is None:
            continue
        feature_id = md5(features)
        if ignore_seen and feature_id in feature_ids:
            print("seen", feature_id)
            continue
        # Reuse the variable: from timestamp to elapsed featurization time.
        stat_computation_time = time.time() - stat_computation_time
        result = {
            "feature_id": feature_id,
            "features": features,
            "class": self.__class__.__name__,
            "stat_computation_time": stat_computation_time,
        }
        bench = []
        for kwargs in self.compression_options:
            size, write_time, read_time = self.benchmark(
                df, kwargs, timeout)
            if size is None:
                # write_error(line)
                print("error, skipping", kwargs)
                continue
            print(kwargs, size, write_time, read_time)
            bench.append({
                "kwargs": json.dumps(kwargs),
                "size": size,
                "write_time": write_time,
                "read_time": read_time,
            })
        result["bench"] = bench
        if bench:
            self.model_data.append(result)
            if save:
                # Append-only jsonl: one benchmark record per line.
                with open(model_path, "a") as f:
                    f.write(json.dumps(result) + "\n")
            results.append(result)
            feature_ids.add(feature_id)
    ### run benchmarks should return a total overview or something, but now just from the last df
    # return pd.DataFrame(bench).set_index("kwargs")
    return results