Ejemplo n.º 1
0
def get_model_data(model_type, model_name, compression_options=None):
    """Load the benchmark records stored for a model, memoized in ``data_cache``.

    Lookup order:

    1. ``data_cache`` under the key ``model_type + "_" + model_name``;
    2. the packaged resource ``shrynk/{model_type}_{model_name}.jsonl.gzip``
       inside the ``data`` package;
    3. the local file ``{model_type}_{model_name}.jsonl`` resolved through
       ``shrynk_path``;
    4. an empty list when none of the above exists.

    Args:
        model_type: First half of the cache key / file name.
        model_name: Second half of the cache key / file name.
        compression_options: Optional iterable of JSON-serializable option
            objects. When given, each record's ``"bench"`` list is restricted
            to entries whose ``"kwargs"`` value equals the JSON dump of one of
            these options.

    Returns:
        A list of records. Without ``compression_options`` this is the cached
        list itself, so callers should treat it as read-only. With
        ``compression_options`` it is a new list of shallow record copies, so
        the cached records are never narrowed by a filter.
    """
    from preconvert.output import json

    # The cache is keyed by type AND name; checking the bare model name (as
    # this function used to) never matched the keys that are stored below.
    cache_key = model_type + "_" + model_name
    if cache_key in data_cache:
        data = data_cache[cache_key]
    else:
        try:
            packed = pkgutil.get_data(
                "data", "shrynk/{}_{}.jsonl.gzip".format(model_type, model_name)
            )
            if packed is None:
                # pkgutil.get_data returns None (instead of raising) when the
                # package cannot be located; treat that like a missing file.
                raise FileNotFoundError(cache_key)
            data = [
                json.loads(line)
                for line in decompress(packed).decode("utf8").split("\n")
                if line.strip()
            ]
        except FileNotFoundError:
            try:
                with open(shrynk_path("{}_{}.jsonl".format(model_type, model_name))) as f:
                    data = [json.loads(x) for x in f.read().split("\n") if x]
            except FileNotFoundError:
                data = []
        # Cache the unfiltered records so later calls with different
        # compression_options are not served a previously filtered view.
        data_cache[cache_key] = data

    if compression_options is None:
        return data

    known_kwargs = {json.dumps(x) for x in compression_options}
    # Build new record dicts rather than editing in place: the cached records
    # must keep their full "bench" lists.
    return [
        dict(record, bench=[y for y in record["bench"] if y["kwargs"] in known_kwargs])
        for record in data
    ]
Ejemplo n.º 2
0
def iread(fn):
    """Lazily yield the parsed JSON value of every line in a JSON-lines file.

    This is a generator, so nothing (including the argument check) runs until
    the first item is requested.

    Args:
        fn: Path of the file to read; must be a ``str``.

    Yields:
        The result of ``json.loads`` for each line, in file order.

    Raises:
        TypeError: If ``fn`` is not a ``str``.
        Exception: If a line cannot be parsed. ``args`` holds the message
            (with the zero-based index of the offending line) and the raw
            line; the original parsing error is attached as ``__cause__``.
    """
    from preconvert.output import json

    if not isinstance(fn, str):
        raise TypeError("Cannot iteratively read compressed file now")
    with open(fn) as f:
        for i, line in enumerate(f):
            try:
                record = json.loads(line)
            except Exception as e:
                msg = "JSON-L parsing error in line number {} in the jsonl file".format(
                    i)
                raise Exception(msg, line) from e
            # Yield outside the try block: an exception thrown into the
            # generator by the consumer must not be reported as a parse error.
            yield record
Ejemplo n.º 3
0
    def predict(self, features):
        """Predict for the given input and return the decoded prediction.

        Args:
            features: A ``pd.DataFrame`` (first passed through
                ``self.get_features``), a ``dict`` of features (wrapped into a
                one-row DataFrame), or an object that already supports
                ``fillna``.

        Returns:
            The result of ``json.loads`` applied to the first prediction
            produced by ``self.clf.predict``.
        """
        from preconvert.output import json

        # A DataFrame is featurized first; if that yields a dict, the next
        # check turns it into a single-row frame.
        if isinstance(features, pd.DataFrame):
            features = self.get_features(features)
        if isinstance(features, dict):
            features = pd.DataFrame([features])

        # Silence DeprecationWarning for modules matching 'sklearn*'.
        warnings.filterwarnings(action='ignore',
                                category=DeprecationWarning,
                                module='sklearn*')

        # Missing feature values are replaced by -100 before predicting.
        feature_frame = features.fillna(-100)
        predictions = self.clf.predict(feature_frame)
        label = predictions[0]

        # The first prediction is either the encoded string itself or a
        # container whose first element is that string.
        if isinstance(label, str):
            return json.loads(label)
        return json.loads(label[0])
Ejemplo n.º 4
0
def get_benchmark_html(df, fname):
    """Render the HTML "Results" fragment comparing prediction and benchmark.

    The function featurizes ``df``, obtains benchmark results (from a stored
    blob when ``IN_PRODUCTION`` and one exists, otherwise by running the
    benchmarks), ranks them by a weighted score using the module-level
    ``weights``, and reports at which rank the inferred compression landed.
    In production, freshly computed results are uploaded together with some
    request metadata.

    Args:
        df: Data passed to ``pdc.get_features`` and ``pdc.run_benchmarks``.
        fname: File name shown in the page and stored with saved results.
            It is treated as untrusted text and HTML-escaped where it is
            displayed.

    Returns:
        The HTML fragment as a ``str``.
    """
    # Local import keeps this block self-contained.
    from html import escape as html_escape

    features = pdc.get_features(df)
    bench_res = None
    save = False
    if IN_PRODUCTION:
        blob = get_blob(features)
        if blob.exists():
            results = json.loads(blob.download_as_string())
            bench_res = results["bench"]
        else:
            results = pdc.run_benchmarks(df,
                                         save=False,
                                         ignore_seen=False,
                                         timeout=False)[0]
            # make a copy not to pop kwargs from results object which will be saved
            bench_res = deepcopy(results)["bench"]
            save = True
    else:
        bench_res = pdc.run_benchmarks(df,
                                       save=False,
                                       ignore_seen=False,
                                       timeout=False)[0]["bench"]

    # Each benchmark entry's "kwargs" value becomes the row label; popping it
    # leaves only the measurement columns in the frame.
    kwargs = [x.pop("kwargs") for x in bench_res]
    bench_res = pd.DataFrame(bench_res, index=kwargs)
    inferred = pdc.infer(features)

    # Weighted score per row; rows are sorted ascending on it.
    z_name = "z {}".format(tuple(weights))
    bench_res[z_name] = (scale(bench_res) * weights).sum(axis=1)
    bench_res = bench_res.round(5).sort_values(z_name)
    bench_res = bench_res[[z_name, "size", "write_time", "read_time"]]

    # 1-based rank of the predicted option among the sorted rows; the trailing
    # -1 guarantees res_index[0] exists when the prediction is not present.
    y = json.dumps(inferred)
    res_index = [i + 1 for i, x in enumerate(bench_res.index) if x == y] + [-1]

    if save:
        # Only the first address of the X-Forwarded-For header is stored.
        ip = request.environ.get("HTTP_X_FORWARDED_FOR", "")
        ip = ip.split(",")[0]
        results["web"] = {
            "utctime": datetime.utcnow().isoformat(),
            "ip": ip,
            "predicted": inferred,
            "res_index": res_index[0],  # 1 is 1st, 2 is 2nd
            "filename": fname,
            "weights": weights.tolist(),
        }
        blob.upload_from_string(json.dumps(results))
        print("saved blob")

    # Replace the JSON row labels with a readable "key=value ..." form.
    bench_res.index = [
        " ".join(["{}={!r}".format(k, v) for k, v in json.loads(x).items()])
        for x in bench_res.index
    ]

    # The "Wrong!" notice is hidden only when the prediction ranked first.
    learning = "none" if res_index and res_index[0] == 1 else "inherit"

    # nth[-2:] is used as a CSS class and nth[:-2] as the row number below, so
    # the "not found" placeholder "999" yields class "99" and row 9.
    nth = {
        1: "1st",
        2: "2nd",
        3: "3rd",
        -1: "999"
    }.get(res_index[0],
          str(res_index[0]) + "th")
    # upload(features, "{}-{}".format(file.filename, time.time()))
    features = {
        k.replace("quantile_proportion", "quantile"):
        round(v, 3) if isinstance(v, float) else v
        for k, v in features.items()
    }

    # The whole string is wrapped in Markup (i.e. declared safe), so the
    # caller-supplied file name has to be escaped before it is concatenated.
    # NOTE(review): format_res also receives the raw fname; confirm that it
    # escapes the value if it embeds it in its output.
    safe_fname = html_escape(fname)

    return str(
        Markup(
            '<center> <h5 class="tagline"> Results: </h5></center>' +
            '<div class="container" style="margin-top: 2rem"><div class="row">'
            +
            '<div class="col l10 offset-l2" style="padding-bottom: 2rem; padding-top: 1rem;">The data was featurized, and a prediction was made. Then, all the compressions were ran for this file so we can see if the prediction was correct (the ground truth).</div>'
            + '<div class="col s12 m6 l3 offset-l2">' + "<b>Filename: </b>" +
            safe_fname + "<br><b>Features: </b>" + '<code class="codes">' +
            json.dumps(features, indent=4) + "</code>" + '</div>' +
            '<div class="col s12 m6 l3 offset-l3">' +
            "<br><center style='line-height: 3'><b>Predicted: </b><br>"
            # just using features here instead of data to be faster
            + " ".join(["{}={!r}".format(k, v) for k, v in inferred.items()]) +
            "<br><b>Result:</b><br><span class='result {}'>{}</span> / {}<br><div style='display: {}'><span style='color: #ee6e73'>Wrong!</span> We will learn from this...</div>"
            .format(nth[-2:], nth, bench_res.shape[0], learning) +
            "</center></div></div>" +
            "<center><h4>Ground truth</h4><div class='show-on-small hide-on-med-and-up' style='padding: 0.5rem; color: grey'> -- scroll -> </center>"
            + replacenth(
                format_res(bench_res, tuple(weights), fname),
                "<tr ",
                '<tr class="resultinv {}" '.format(nth[-2:]),
                int(nth[:-2]),
            )))