Example #1
import os

def show_fig(fig, title):
    #TODO maybe allow overriding this with an env var
    is_pycharm = "PYCHARM_HOSTED" in os.environ
    if is_pycharm and not isnotebook():
        fig.show()
    if not isnotebook():
        save_path = generate_filepath(title)
        fig.savefig(save_path)
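
# A minimal usage sketch (an assumption for illustration; it relies on matplotlib and on
# the project-local helpers generate_filepath/isnotebook that show_fig uses internally):
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([1, 2, 3], [4, 1, 9])
show_fig(fig, "demo_plot")  # shown interactively in PyCharm; saved to disk when not in a notebook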
Example #2
def display_output(self, objname, ignore_err=False):
    txt = merge_streams(self.p.loaded_objects[objname]["metadata"]["obj_info"]["stdout"],
                        self.p.loaded_objects[objname]["metadata"]["obj_info"]["stderr"] if not ignore_err else "",
                        objname)
    if isnotebook():
        txt = txt.replace("\n", "<br>")
    for k, v in TRANSLATOR.items():
        if k != "end":
            txt = txt.replace(v, k)
    while color.END in txt:  # instead of just dropping color.END, replace it with whichever sign occurred last before it
        sign_before = max({i: txt[:txt.find(color.END)].rfind(i) for i in set(TRANSLATOR.keys()) - {"end"}}.items(),
                          key=lambda x: x[1])[0]
        txt = txt.replace(color.END, sign_before, 1)
    return txt
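
# A hypothetical round-trip to illustrate the two loops above (TRANSLATOR and color are
# project-local and not shown here; the concrete mapping is an assumption):
#     TRANSLATOR = {"*b*": color.BOLD, "end": color.END}   # sign -> ANSI code
#     txt = color.BOLD + "hi" + color.END
# The for-loop turns color.BOLD back into "*b*", and the while-loop replaces the remaining
# color.END with the sign that occurred last before it, yielding "*b*hi*b*".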
Example #3
import time
from multiprocessing import JoinableQueue, Process

from misc_util.pretty_print import isnotebook

if isnotebook():
    from tqdm import tqdm_notebook as tqdm
else:
    from tqdm import tqdm


class WorkerPool():
    def __init__(self, n_workers, workerobj=None, pgbar=None, comqu=None):
        self.pgbar = pgbar if pgbar is None else pgbar + f" [{n_workers} procs]"
        self.n_workers = n_workers
        if n_workers > 1:
            self.qu = JoinableQueue()
            self.prioqu = JoinableQueue()
            self.donequ = JoinableQueue()
            self.workers = [
                Worker(self.qu, self.prioqu, self.donequ, num, workerobj)
                for num in range(n_workers)
            ]
            self.known_deaths = []
            self.comqu = comqu

    def __enter__(self):
        return self

    def work(self, iterable, func, enum_start=0):
        iterable = list(enumerate(iterable, start=enum_start))
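
# Hypothetical usage sketch (the remainder of work() and the Worker class are not shown in
# this snippet, so the exact call signature and return shape are assumptions):
#     with WorkerPool(4, pgbar="processing") as pool:
#         results = pool.work(items, some_func)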
Example #4
def classify_shallowtree(clusters,
                         embedding,
                         descriptions,
                         dataset_class,
                         one_vs_rest,
                         dt_depth,
                         test_percentage_crossval,
                         classes=None,
                         cluster_reprs=None,
                         do_plot=False,
                         verbose=False,
                         return_features=True,
                         metric="acc",
                         balance_classes=True,
                         clus_rep_algo=None,
                         shutup=False,
                         repeat=1,
                         pgbar=None,
                         also_unweighted=False,
                         **kwargs):
    cm = (lambda *args, **kwargs: nullcontext()) if pgbar is None else tqdm
    clusters, planes = clusters.values()
    if classes is None:
        classes = descriptions.additionals_names[0]
    if classes in descriptions.additionals_names:
        catnames = None
        if hasattr(dataset_class,
                   "CATNAMES") and classes in dataset_class.CATNAMES:
            catnames = dataset_class.CATNAMES.get(classes)
        hascat = [
            n for n, i in enumerate(descriptions._descriptions)
            if i._additionals[classes] is not None
        ]
        getcat = lambda i: descriptions._descriptions[i]._additionals[classes]
    elif hasattr(dataset_class, "get_custom_class"):
        getcat, hascat, catnames = dataset_class.get_custom_class(
            classes, descriptions, verbose=(verbose and not shutup), **kwargs)
    else:
        raise Exception(f"The class {classes} does not exist!")
    if catnames:
        orig_getcat = getcat
        getcat = lambda x: catnames.get(int(orig_getcat(x)), orig_getcat(x))

    if not shutup:
        print(
            f"Using classes from {classes} - {len(hascat)}/{len(descriptions)} entities have a class"
        )
    cats = {i: getcat(i) for i in hascat}

    cluster_names = get_name_dict(clusters, cluster_reprs, clus_rep_algo)
    #first I want the distances to the origins of the respective dimensions (induced by the clusters), which induce the respective rankings (see DESC15 p.24u, proj2 of load_semanticspaces.load_projections)
    axis_dists = {
        i: {cluster_names[k]: v.dist(embedding[i])
            for k, v in planes.items()}
        for i in hascat
    }
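    # Illustration of the resulting structure (hypothetical names and numbers):
    #     axis_dists[3] == {"sports": 0.84, "politics": 0.12, ...}
    # i.e. per entity, one distance to each cluster-induced plane.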
    if verbose and not shutup:
        best_per_dim = {
            k: descriptions._descriptions[v].title
            for k, v in pd.DataFrame(axis_dists).T.idxmax().to_dict().items()
        }
        print(
            "Highest-ranking descriptions [with any class] per dimension:\n    "
            + "\n    ".join([
                f"*b*{k.ljust(max([len(i) for i in best_per_dim.keys()][:20]))}*b*: {v}"
                for k, v in best_per_dim.items()
            ][:20]))
    #TODO also show places 2, 3, 4 - here we again see very similar ones ("football stadium", "stadium", "fan" for "goalie")
    #TODO axis_dists is all I need for the movietuner already!! I can say "give me something like X, only with more Y"

    if not shutup:
        print(f"Labels ({len(set(cats.values()))} classes):",
              ", ".join(f"*b*{k}*b*: {v}" for k, v in Counter(cats.values(
              )).items()))  #TODO pay attention! consider class_weight etc!
    consider = pd.DataFrame(
        {descriptions._descriptions[i].title: axis_dists[i]
         for i in hascat})
    ranked = pd.DataFrame([rankdata(i) for i in consider.values],
                          index=consider.index,
                          columns=consider.columns).astype(int).T
    ranked = ranked / ranked.shape[0]  #looks better if we're doing relative rankings
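    # For reference (presumably scipy.stats.rankdata): rankdata([0.2, 3.1, 1.5]) -> [1., 3., 2.],
    # so after dividing by the number of entities every cell is a relative rank in (0, 1].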
    #TODO some things belong to several classes, so in those cases I'd have to build several trees per class!
    if not shutup:
        print(
            f'Eval-Settings: metric: *b*{metric}*b*, type: *b*{("one-vs-rest" if one_vs_rest else "all-at-once")}*b*, DT-Depth: *b*{dt_depth}*b*, train-test-split:*b*',
            f'{test_percentage_crossval}-fold cross-validation'
            if test_percentage_crossval > 1 else
            f'{test_percentage_crossval*100:.1f}% in test-set', "*b*")
    class_percents = sorted(
        [i / len(cats) for i in Counter(cats.values()).values()], reverse=True)
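    # e.g. class counts {"a": 50, "b": 30, "c": 20} over 100 entities give class_percents == [0.5, 0.3, 0.2]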
    features_outvar = []
    all_targets = []
    all_classes = []
    if one_vs_rest:
        scores = {}
        plottree_strs = []
        raw_scores = {}
        with cm(total=len(set(cats.values())) * repeat,
                desc=pgbar if isinstance(pgbar, str) else None) as pgbar:
            for cat in set(cats.values()):
                targets = np.array(np.array(list(cats.values())) == cat,
                                   dtype=int)
                for n in range(repeat):
                    scores[cat], plottree_str, rawscores = classify(
                        ranked.values,
                        targets,
                        list(ranked.columns), ["other", cat],
                        dt_depth,
                        test_percentage_crossval,
                        metric=metric,
                        do_plot=do_plot,
                        features_outvar=features_outvar,
                        balance_classes=balance_classes,
                        do_render=False,
                        shuffle=(repeat > 1))
                    raw_scores.setdefault(cat, []).append(rawscores)
                    if pgbar is not None: pgbar.update(1)
                plottree_strs.append(plottree_str)
                all_targets.append(targets)
                all_classes.append(["other", cat])
        if do_plot and not shutup:  #holy shit that merging took a while - see https://stackoverflow.com/q/47258673/5122790 for how
            dot_cnts = [
                subprocess.run(["dot"],
                               stdout=subprocess.PIPE,
                               input=str(i),
                               encoding="UTF-8").stdout
                for n, i in enumerate(plottree_strs)
            ]
            if not isnotebook():
                a = subprocess.run(["dot"],
                                   stdout=subprocess.PIPE,
                                   input="\n".join(dot_cnts),
                                   encoding="UTF-8").stdout
                b = subprocess.run(["gvpack", "-array_t2", "-m20"],
                                   stdout=subprocess.PIPE,
                                   input=a,
                                   encoding="UTF-8").stdout
                b = re.sub(r"<br\/>value = \[.*?\]", "", b)
                b = re.sub(r" &le; 0.(\d\d)(\d*)<br", r" &le; \1.\2% <br", b)
                subprocess.run(
                    ["neato", "-n2", "-s", "-Tpdf", "-o", "merge.pdf"],
                    stdout=subprocess.PIPE,
                    input=b,
                    encoding="UTF-8")
                print(f"Saved under {abspath('merge.pdf')}")
                os.system("xdg-open merge.pdf")
            else:
                JUPYTER_N_COLS = 2
                plots = [
                    subprocess.run(["gvpack", "-m50"],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.DEVNULL,
                                   input="\n".join(dot_cnts[i:i +
                                                            JUPYTER_N_COLS]),
                                   encoding="UTF-8").stdout
                    for i in range(0, len(dot_cnts), JUPYTER_N_COLS)
                ]
                for plot in plots:
                    display(graphviz.Source(plot))
                    print("\n\n")
        if repeat > 1 and isinstance(metric, (list, tuple)):  #this is dirty as f**k but welp, time is getting short
            return raw_scores, Counter(cats.values())
        score = sum([v * Counter(cats.values())[k]
                     for k, v in scores.items()]) / len(cats)
        if not shutup:
            print("Per-Class-Scores:",
                  ", ".join(f"{k}: {v:.2f}" for k, v in scores.items()))
            print(
                f"Unweighted Mean {metric}: {sum(scores.values()) / len(scores.values()):.2%}"
            )
            print(f"Weighted Mean {metric}: {score:.2%}")
    else:  #all at once
        raw_scores = []
        if dt_depth is not None and len(set(cats.values())) > 2**dt_depth:
            warnings.warn(
                f"There are more classes ({len(set(cats.values()))}) than your decision-tree can possibly classify ({2**dt_depth})"
            )
        targets = np.array(list(cats.values()))
        all_targets.append(targets)
        for n in range(repeat):
            score, _, rawscores = classify(ranked.values,
                                           targets,
                                           list(ranked.columns),
                                           list(catnames.values()),
                                           dt_depth,
                                           test_percentage_crossval,
                                           metric=metric,
                                           do_plot=do_plot,
                                           features_outvar=features_outvar,
                                           balance_classes=balance_classes,
                                           do_render=False)
            raw_scores.append(rawscores)
            all_classes.append(list(catnames.values()))
        if not shutup and repeat <= 1:
            print(f"{metric}: {score:.2f}")
            if dt_depth == 1:
                print(
                    f"Baseline {metric}: {class_percents[0]:.2f}"
                )  #all into one class. In practice it is often worse than this because we have class_weight=balanced.
                if metric.lower() in ["acc", "accuracy"]:
                    print(
                        f"Maximally achievable {metric}: {sum(class_percents[:2]):.2f}"
                    )  #two leaves, one is the (perfectly classified) class 1, the other gets the label for the second-most-common class
    if repeat > 1:
        return raw_scores, Counter(cats.values())
    if return_features:
        return [
            features_outvar, ranked, all_targets,
            list(scores.values()) if one_vs_rest else [score], all_classes,
            list(ranked.columns)
        ]
    return score if not also_unweighted else (
        score,
        sum(scores.values()) / len(scores.values()) if one_vs_rest else np.nan)
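
# Hypothetical call sketch (clusters, embedding, descriptions and dataset_class are
# project-specific objects that this snippet does not define):
#     score = classify_shallowtree(clusters, embedding, descriptions, dataset_class,
#                                  one_vs_rest=True, dt_depth=3, test_percentage_crossval=5,
#                                  metric="acc", return_features=False)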