Example #1
0
def get_model_data(model_type, model_name, compression_options=None):
    """Load benchmark model data for (model_type, model_name), with caching.

    Tries package data (gzipped jsonl) first, then a local jsonl file via
    shrynk_path, and falls back to an empty list. If compression_options is
    given, each record's "bench" entries are filtered down to those options.

    :param model_type: e.g. the data-format family the model covers
    :param model_name: name of the trained model
    :param compression_options: optional list of kwargs dicts to keep
    :return: list of model-data dicts (also stored in data_cache)
    """
    from preconvert.output import json

    cache_key = model_type + "_" + model_name
    # BUG FIX: the cache membership test used `model_name` while entries are
    # stored under "<model_type>_<model_name>", so the cache never hit.
    if cache_key in data_cache:
        return data_cache[cache_key]
    try:
        data = pkgutil.get_data("data", "shrynk/{}_{}.jsonl.gzip".format(model_type, model_name))
        data = [
            json.loads(line) for line in decompress(data).decode("utf8").split("\n") if line.strip()
        ]
    except FileNotFoundError:
        try:
            with open(shrynk_path("{}_{}.jsonl".format(model_type, model_name))) as f:
                data = [json.loads(x) for x in f.read().split("\n") if x]
        except FileNotFoundError:
            data = []
    if compression_options is not None:
        # compare serialized kwargs, since the bench entries store them as JSON
        known_kwargs = {json.dumps(x) for x in compression_options}
        for x in data:
            x["bench"] = [y for y in x["bench"] if y["kwargs"] in known_kwargs]
    data_cache[cache_key] = data
    return data
Example #2
0
def append(obj, fn):
    """Serialize *obj* as JSON and append it as a single line to path *fn*."""
    from preconvert.output import json

    if not isinstance(fn, str):
        raise TypeError("Cannot append to compression")
    line = json.dumps(obj) + "\n"
    with open(fn, "a+") as handle:
        handle.write(line)
Example #3
0
def iwrite(obj, fn):
    """Write each chunk yielded by iterable *obj* to path *fn*, one JSON line each."""
    from preconvert.output import json

    if not isinstance(fn, str):
        raise TypeError("Cannot iteratively write compressed")
    with open(fn, "w") as handle:
        handle.writelines(json.dumps(chunk) + "\n" for chunk in obj)
Example #4
0
def write(obj, fn):
    """Dump *obj* as JSON.

    If *fn* is a path, write indented JSON to it; otherwise *fn* is treated
    as a binary file-like object and receives compact UTF-8 encoded JSON.
    """
    from preconvert.output import json

    if isinstance(fn, str):
        with open(fn, "w") as handle:
            json.dump(obj, handle, indent=4)
    else:
        # presumably a file opened in binary mode — write encoded bytes
        fn.write(bytes(json.dumps(obj), encoding="utf8"))
Example #5
0
def get_cache_file_name(domain, request_info, compression=".gz"):
    """Build the cache file path for *request_info* under *domain*.

    The request is keyed by the MD5 of its JSON serialization; the first
    three hex characters become a directory shard, the rest the file name.
    A falsy *compression* yields a plain ".json" suffix.
    """
    from preconvert.output import json

    digest = hashlib.md5(json.dumps(request_info).encode("utf8")).hexdigest()
    shard, name = digest[:3], digest[3:]
    suffix = compression if compression else ""
    return f"~/.just_requests/{domain}/{shard}/{name}.json{suffix}"
Example #6
0
    def run_benchmarks(self,
                       data_generator,
                       ignore_seen=True,
                       timeout=300,
                       save=True,
                       verbose=True):
        """Featurize each input from *data_generator*, benchmark every
        compression option on it, and collect/persist the results.

        :param data_generator: iterable of inputs; a bare str / DataFrame /
            dict is wrapped in a single-element list
        :param ignore_seen: skip inputs whose feature hash already exists
            in the loaded model data
        :param timeout: per-benchmark timeout forwarded to self.benchmark
        :param save: if True, append each result as a JSON line to the
            on-disk model file
        :param verbose: forwarded to self.benchmark
        :return: list of result dicts (feature_id, features, class,
            stat_computation_time, bench)
        """
        from preconvert.output import json

        if self.model_data is None:
            self.model_data = get_model_data(self.model_type, self.model_name,
                                             self.compression_options)
        # hashes of inputs already benchmarked, for de-duplication
        feature_ids = set([x["feature_id"] for x in self.model_data])
        results = []
        index = []  # NOTE(review): never used below — candidate for removal
        if isinstance(data_generator, (str, pd.DataFrame, dict)):
            data_generator = [data_generator]
        for num, df in enumerate(data_generator):
            df, status = self.cast_to_data(df)
            if df is None:
                print(status)
                continue
            # time only the featurization step
            stat_computation_time = time.time()
            try:
                features = self.get_features(df)
            except self.bench_exceptions:
                continue
            if features is None:
                continue
            feature_id = md5(features)
            if ignore_seen and feature_id in feature_ids:
                print("seen", feature_id)
                continue
            stat_computation_time = time.time() - stat_computation_time
            result = {
                "feature_id": feature_id,
                "features": features,
                "class": self.__class__.__name__,
                "stat_computation_time": stat_computation_time,
            }
            bench = self.benchmark(df, timeout, verbose=verbose)
            result["bench"] = bench
            # only keep/persist inputs that produced at least one benchmark
            if bench:
                model_data_path = shrynk_path("{}_{}.jsonl".format(
                    self.model_type, self.model_name))
                self.model_data.append(result)
                if save:
                    # append as one JSON line so the file stays streamable
                    with open(model_data_path, "a") as f:
                        f.write(json.dumps(result) + "\n")
                results.append(result)

            feature_ids.add(feature_id)
        ### run benchmarks should return a total overview or something
        # return pd.DataFrame(bench).set_index("kwargs")
        return results
Example #7
0
    def benchmark(self, df, timeout=300, verbose=False):
        """Run every compression option on *df* and gather size/time stats.

        Options whose single benchmark returns no size are skipped.
        Returns a list of dicts with serialized kwargs, size, write_time
        and read_time.
        """
        from preconvert.output import json

        measurements = []
        for kwargs in self.compression_options:
            size, write_time, read_time = self.single_benchmark(
                df, kwargs, timeout)
            if size is None:
                # a failed run yields no size; skip this option
                print("error, skipping", kwargs)
                continue
            if verbose:
                print(kwargs, size, write_time, read_time)
            measurements.append({
                "kwargs": json.dumps(kwargs),
                "size": size,
                "write_time": write_time,
                "read_time": read_time,
            })
        return measurements
Example #8
0
def md5(features):
    """Return a stable MD5 hex digest of *features* (key-order independent)."""
    from preconvert.output import json

    serialized = json.dumps(features, sort_keys=True)
    return hashlib.md5(serialized.encode()).hexdigest()
Example #9
0
def get_benchmark_html(df, fname):
    """Benchmark *df*, compare against the model's prediction, and render
    the outcome as an HTML fragment.

    In production, results are cached in / saved to a blob keyed by the
    dataframe's features; otherwise benchmarks always run locally.

    :param df: the dataframe to featurize and benchmark
    :param fname: display name of the uploaded file (shown in the HTML)
    :return: HTML string with the prediction, its rank, and a result table
    """
    features = pdc.get_features(df)
    bench_res = None
    save = False
    if IN_PRODUCTION:
        blob = get_blob(features)
        if blob.exists():
            # cache hit: reuse previously computed benchmark results
            results = json.loads(blob.download_as_string())
            bench_res = results["bench"]
        else:
            results = pdc.run_benchmarks(df,
                                         save=False,
                                         ignore_seen=False,
                                         timeout=False)[0]
            # make a copy not to pop kwargs from results object which will be saved
            bench_res = deepcopy(results)["bench"]
            save = True
    else:
        bench_res = pdc.run_benchmarks(df,
                                       save=False,
                                       ignore_seen=False,
                                       timeout=False)[0]["bench"]
    # use the serialized kwargs as the table index
    kwargs = [x.pop("kwargs") for x in bench_res]
    bench_res = pd.DataFrame(bench_res, index=kwargs)
    inferred = pdc.infer(features)
    # combined z-score column: weighted sum of the scaled metric columns
    z_name = "z {}".format(tuple(weights))
    bench_res[z_name] = (scale(bench_res) * weights).sum(axis=1)
    bench_res = bench_res.round(5).sort_values(z_name)
    bench_res = bench_res[[z_name, "size", "write_time", "read_time"]]
    # 1-based rank of the predicted option in the sorted results; -1 if absent
    y = json.dumps(inferred)
    res_index = [i + 1 for i, x in enumerate(bench_res.index) if x == y] + [-1]
    if save:
        # record request metadata alongside the results before uploading
        ip = request.environ.get("HTTP_X_FORWARDED_FOR", "")
        ip = ip.split(",")[0]
        results["web"] = {
            "utctime": datetime.utcnow().isoformat(),
            "ip": ip,
            "predicted": inferred,
            "res_index": res_index[0],  # 1 is 1st, 2 is 2nd
            "filename": fname,
            "weights": weights.tolist(),
        }
        blob.upload_from_string(json.dumps(results))
        print("saved blob")
    # prettify the index: JSON kwargs -> "key=value" pairs
    bench_res.index = [
        " ".join(["{}={!r}".format(k, v) for k, v in json.loads(x).items()])
        for x in bench_res.index
    ]
    # hide the "Wrong!" notice when the prediction ranked first
    learning = "none" if res_index and res_index[0] == 1 else "inherit"
    nth = {
        1: "1st",
        2: "2nd",
        3: "3rd",
        -1: "999"
    }.get(res_index[0],
          str(res_index[0]) + "th")
    # upload(features, "{}-{}".format(file.filename, time.time()))
    # shorten feature names and round float values for display
    features = {
        k.replace("quantile_proportion", "quantile"):
        round(v, 3) if isinstance(v, float) else v
        for k, v in features.items()
    }
    return str(
        Markup(
            '<center> <h5 class="tagline"> Results: </h5></center>' +
            '<div class="container" style="margin-top: 2rem"><div class="row">'
            +
            '<div class="col l10 offset-l2" style="padding-bottom: 2rem; padding-top: 1rem;">The data was featurized, and a prediction was made. Then, all the compressions were ran for this file so we can see if the prediction was correct (the ground truth).</div>'
            + '<div class="col s12 m6 l3 offset-l2">' + "<b>Filename: </b>" +
            fname + "<br><b>Features: </b>" + '<code class="codes">' +
            json.dumps(features, indent=4) + "</code>" + '</div>' +
            '<div class="col s12 m6 l3 offset-l3">' +
            "<br><center style='line-height: 3'><b>Predicted: </b><br>"
            # just using features here instead of data to be faster
            + " ".join(["{}={!r}".format(k, v) for k, v in inferred.items()]) +
            "<br><b>Result:</b><br><span class='result {}'>{}</span> / {}<br><div style='display: {}'><span style='color: #ee6e73'>Wrong!</span> We will learn from this...</div>"
            .format(nth[-2:], nth, bench_res.shape[0], learning) +
            "</center></div></div>" +
            "<center><h4>Ground truth</h4><div class='show-on-small hide-on-med-and-up' style='padding: 0.5rem; color: grey'> -- scroll -> </center>"
            # highlight the row of the predicted option in the rendered table
            + replacenth(
                format_res(bench_res, tuple(weights), fname),
                "<tr ",
                '<tr class="resultinv {}" '.format(nth[-2:]),
                int(nth[:-2]),
            )))
Example #10
0
    def run_benchmarks(self,
                       data_generator,
                       save=True,
                       ignore_seen=True,
                       timeout=300):
        """Featurize each input from *data_generator*, benchmark every
        compression option on it, and collect/persist the results.

        :param data_generator: iterable of file paths / DataFrames; a bare
            DataFrame is wrapped in a single-element list
        :param save: if True, append each result as a JSON line to
            ~/shrynk_<model_name>.jsonl
        :param ignore_seen: skip inputs whose feature hash already exists
            in the loaded model data
        :param timeout: per-option timeout forwarded to self.benchmark
        :return: list of result dicts (feature_id, features, class,
            stat_computation_time, bench)
        """
        from preconvert.output import json

        model_path = os.path.expanduser("~/shrynk_{}.jsonl".format(
            self.model_name))
        if self.model_data is None:
            # NOTE(review): get_model_data elsewhere takes
            # (model_type, model_name, compression_options); this two-arg
            # call may be against an older signature — confirm.
            self.model_data = get_model_data(self.model_name,
                                             self.compression_options)
        # hashes of inputs already benchmarked, for de-duplication
        feature_ids = set([x["feature_id"] for x in self.model_data])
        results = []
        index = []  # NOTE(review): never used below — candidate for removal
        if isinstance(data_generator, pd.DataFrame):
            data_generator = [data_generator]
        for num, df in enumerate(data_generator):
            # a string item is treated as a file path and loaded if possible
            if isinstance(df, str) and os.path.isfile(df):
                try:
                    df = self.load(df)
                except self.bench_exceptions:
                    continue
            if df is None:
                continue
            # still a string here means loading failed or path didn't exist
            if isinstance(df, str):
                continue
            # time only the featurization step
            stat_computation_time = time.time()
            features = self.get_features(df)
            if features is None:
                continue
            feature_id = md5(features)
            if ignore_seen and feature_id in feature_ids:
                print("seen", feature_id)
                continue
            stat_computation_time = time.time() - stat_computation_time
            result = {
                "feature_id": feature_id,
                "features": features,
                "class": self.__class__.__name__,
                "stat_computation_time": stat_computation_time,
            }
            bench = []
            for kwargs in self.compression_options:
                size, write_time, read_time = self.benchmark(
                    df, kwargs, timeout)
                if size is None:
                    # a failed run yields no size; skip this option
                    # write_error(line)
                    print("error, skipping", kwargs)
                    continue
                print(kwargs, size, write_time, read_time)
                bench.append({
                    "kwargs": json.dumps(kwargs),
                    "size": size,
                    "write_time": write_time,
                    "read_time": read_time,
                })
            result["bench"] = bench
            # only keep/persist inputs that produced at least one benchmark
            if bench:
                self.model_data.append(result)
                if save:
                    # append as one JSON line so the file stays streamable
                    with open(model_path, "a") as f:
                        f.write(json.dumps(result) + "\n")
                results.append(result)

            feature_ids.add(feature_id)
        ### run benchmarks should return a total overview or something, but now just from the last df
        # return pd.DataFrame(bench).set_index("kwargs")
        return results