import os

def show_fig(fig, title):
    # TODO maybe be able to overwrite this with an env-var
    # isnotebook() and generate_filepath() are provided elsewhere (not shown in this excerpt)
    is_pycharm = "PYCHARM_HOSTED" in os.environ
    if is_pycharm and not isnotebook():
        fig.show()
    if not isnotebook():
        save_path = generate_filepath(title)
        fig.savefig(save_path)
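# A minimal usage sketch, assuming matplotlib is installed and that isnotebook() and
# generate_filepath() are available in scope (generate_filepath(title) is assumed to
# return a writable file path); this demo is not part of the original module.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    demo_fig, demo_ax = plt.subplots()
    demo_ax.plot([0, 1, 2, 3], [0, 1, 4, 9])
    demo_ax.set_title("demo")
    # pops the window up when run from PyCharm; outside a notebook it is also saved
    # under whatever path generate_filepath("demo") produces
    show_fig(demo_fig, "demo")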
def display_output(self, objname, ignore_err=False):
    txt = merge_streams(self.p.loaded_objects[objname]["metadata"]["obj_info"]["stdout"],
                        self.p.loaded_objects[objname]["metadata"]["obj_info"]["stderr"] if not ignore_err else "",
                        objname)
    if isnotebook():
        # in notebooks, use HTML line breaks and translate the TRANSLATOR values back to their keys
        txt = txt.replace("\n", "<br>")
        for k, v in TRANSLATOR.items():
            if k != "end":
                txt = txt.replace(v, k)
        while color.END in txt:
            # rather than replacing with end, you should replace with whatever replacement comes before this
            sign_before = max({i: txt[:txt.find(color.END)].rfind(i) for i in set(TRANSLATOR.keys()) - {"end"}}.items(),
                              key=lambda x: x[1])[0]
            txt = txt.replace(color.END, sign_before, 1)
    return txt
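# Illustrative sketch of the color.END handling above, assuming TRANSLATOR maps markup
# signs such as "*b*" to ANSI escape codes and color.END is the generic reset code;
# neither is defined in this excerpt, so the demo values below are hypothetical.
if __name__ == "__main__":
    TRANSLATOR_DEMO = {"*b*": "\033[1m", "*r*": "\033[91m", "end": "\033[0m"}
    END_DEMO = "\033[0m"
    txt = "\033[1mbold\033[0m and \033[91mred\033[0m"
    # first map every non-"end" escape code back to its markup sign ...
    for sign, code in TRANSLATOR_DEMO.items():
        if sign != "end":
            txt = txt.replace(code, sign)
    # ... then close each run with the sign that opened it, like the while-loop above
    while END_DEMO in txt:
        before = txt[:txt.find(END_DEMO)]
        sign_before = max({s: before.rfind(s) for s in set(TRANSLATOR_DEMO) - {"end"}}.items(), key=lambda x: x[1])[0]
        txt = txt.replace(END_DEMO, sign_before, 1)
    print(txt)  # -> "*b*bold*b* and *r*red*r*"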
import time
from multiprocessing import JoinableQueue, Process

from misc_util.pretty_print import isnotebook

if isnotebook():
    from tqdm import tqdm_notebook as tqdm
else:
    from tqdm import tqdm


class WorkerPool():
    def __init__(self, n_workers, workerobj=None, pgbar=None, comqu=None):
        self.pgbar = pgbar if pgbar is None else pgbar + f" [{n_workers} procs]"
        self.n_workers = n_workers
        if n_workers > 1:
            self.qu = JoinableQueue()
            self.prioqu = JoinableQueue()
            self.donequ = JoinableQueue()
            self.workers = [Worker(self.qu, self.prioqu, self.donequ, num, workerobj) for num in range(n_workers)]
        self.known_deaths = []
        self.comqu = comqu

    def __enter__(self):
        return self

    def work(self, iterable, func, enum_start=0):
        iterable = list(enumerate(iterable, start=enum_start))
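# A hedged usage sketch for WorkerPool, relying only on what is visible in this excerpt
# (the class is truncated here, Worker is defined elsewhere, and what work() returns is
# not shown), so this is an assumption rather than the documented API:
#
#     def square(x):
#         return x * x
#
#     with WorkerPool(4, pgbar="squaring") as pool:
#         pool.work(range(100), square)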
def classify_shallowtree(clusters, embedding, descriptions, dataset_class, one_vs_rest, dt_depth, test_percentage_crossval,
                         classes=None, cluster_reprs=None, do_plot=False, verbose=False, return_features=True, metric="acc",
                         balance_classes=True, clus_rep_algo=None, shutup=False, repeat=1, pgbar=None, also_unweighted=False,
                         **kwargs):
    cm = (lambda *args, **kwargs: nullcontext()) if pgbar is None else tqdm
    clusters, planes = clusters.values()
    if classes is None:
        classes = descriptions.additionals_names[0]
    if classes in descriptions.additionals_names:
        catnames = None
        if hasattr(dataset_class, "CATNAMES") and classes in dataset_class.CATNAMES:
            catnames = dataset_class.CATNAMES.get(classes)
        hascat = [n for n, i in enumerate(descriptions._descriptions) if i._additionals[classes] is not None]
        getcat = lambda i: descriptions._descriptions[i]._additionals[classes]
    elif hasattr(dataset_class, "get_custom_class"):
        getcat, hascat, catnames = dataset_class.get_custom_class(classes, descriptions, verbose=(verbose and not shutup), **kwargs)
    else:
        raise Exception(f"The class {classes} does not exist!")
    if catnames:
        orig_getcat = getcat
        getcat = lambda x: catnames.get(int(orig_getcat(x)), orig_getcat(x))
    if not shutup:
        print(f"Using classes from {classes} - {len(hascat)}/{len(descriptions)} entities have a class")
    cats = {i: getcat(i) for i in hascat}
    cluster_names = get_name_dict(clusters, cluster_reprs, clus_rep_algo)
    # first I want the distances to the origins of the respective dimensions (induced by the clusters), which induce the
    # respective rankings (see DESC15 p.24u, proj2 of load_semanticspaces.load_projections)
    axis_dists = {i: {cluster_names[k]: v.dist(embedding[i]) for k, v in planes.items()} for i in hascat}
    if verbose and not shutup:
        best_per_dim = {k: descriptions._descriptions[v].title for k, v in pd.DataFrame(axis_dists).T.idxmax().to_dict().items()}
        print("Highest-ranking descriptions [with any class] per dimension:\n "
              + "\n ".join([f"*b*{k.ljust(max([len(i) for i in best_per_dim.keys()][:20]))}*b*: {v}"
                            for k, v in best_per_dim.items()][:20]))
        # TODO also show places 2, 3, 4 - here we again see very similar ones ("football stadium", "stadium", "fan" for "goalie")
        # TODO axis_dists is all I need for the movietuner already!! I can say "give me something like X, only with more Y"
    if not shutup:
        print(f"Labels ({len(set(cats.values()))} classes):",
              ", ".join(f"*b*{k}*b*: {v}" for k, v in Counter(cats.values()).items()))
    # TODO pay attention! consider class_weight etc!
    consider = pd.DataFrame({descriptions._descriptions[i].title: axis_dists[i] for i in hascat})
    ranked = pd.DataFrame([rankdata(i) for i in consider.values], index=consider.index, columns=consider.columns).astype(int).T
    ranked = ranked / ranked.shape[0]  # looks better if we're doing relative rankings
    # TODO some things are in multiple classes, so I'd have to build multiple trees per class!
    if not shutup:
        print(f'Eval-Settings: metric: *b*{metric}*b*, type: *b*{("one-vs-rest" if one_vs_rest else "all-at-once")}*b*, DT-Depth: *b*{dt_depth}*b*, train-test-split:*b*',
              f'{test_percentage_crossval}-fold cross-validation' if test_percentage_crossval > 1 else f'{test_percentage_crossval*100:.1f}% in test-set',
              "*b*")
    class_percents = sorted([i / len(cats) for i in Counter(cats.values()).values()], reverse=True)
    features_outvar = []
    all_targets = []
    all_classes = []
    if one_vs_rest:
        scores = {}
        plottree_strs = []
        raw_scores = {}
        with cm(total=len(set(cats.values())) * repeat, desc=pgbar if isinstance(pgbar, str) else None) as pgbar:
            for cat in set(cats.values()):
                targets = np.array(np.array(list(cats.values())) == cat, dtype=int)
                for n in range(repeat):
                    scores[cat], plottree_str, rawscores = classify(ranked.values, targets, list(ranked.columns), ["other", cat],
                                                                    dt_depth, test_percentage_crossval, metric=metric,
                                                                    do_plot=do_plot, features_outvar=features_outvar,
                                                                    balance_classes=balance_classes, do_render=False,
                                                                    shuffle=(repeat > 1))
                    raw_scores.setdefault(cat, []).append(rawscores)
                    if pgbar is not None:
                        pgbar.update(1)
                plottree_strs.append(plottree_str)
                all_targets.append(targets)
                all_classes.append(["other", cat])
        if do_plot and not shutup:
            # holy shit that merging took a while - see https://stackoverflow.com/q/47258673/5122790 for how
            dot_cnts = [subprocess.run(["dot"], stdout=subprocess.PIPE, input=str(i), encoding="UTF-8").stdout
                        for n, i in enumerate(plottree_strs)]
            if not isnotebook():
                a = subprocess.run(["dot"], stdout=subprocess.PIPE, input="\n".join(dot_cnts), encoding="UTF-8").stdout
                b = subprocess.run(["gvpack", "-array_t2", "-m20"], stdout=subprocess.PIPE, input=a, encoding="UTF-8").stdout
                b = re.sub(r"<br\/>value = \[.*?\]", "", b)
                b = re.sub(r" ≤ 0.(\d\d)(\d*)<br", r" ≤ \1.\2% <br", b)
                subprocess.run(["neato", "-n2", "-s", "-Tpdf", "-o", "merge.pdf"], stdout=subprocess.PIPE, input=b, encoding="UTF-8")
                print(f"Saved under {abspath('merge.pdf')}")
                os.system("xdg-open merge.pdf")
            else:
                JUPYTER_N_COLS = 2
                plots = [subprocess.run(["gvpack", "-m50"], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                                        input="\n".join(dot_cnts[i:i + JUPYTER_N_COLS]), encoding="UTF-8").stdout
                         for i in range(0, len(dot_cnts), JUPYTER_N_COLS)]
                for plot in plots:
                    display(graphviz.Source(plot))
                    print("\n\n")
        if repeat > 1 and isinstance(metric, (list, tuple)):
            # this is dirty as f**k but whelp, time is getting short
            return raw_scores, Counter(cats.values())
        score = sum([v * Counter(cats.values())[k] for k, v in scores.items()]) / len(cats)
        if not shutup:
            print("Per-Class-Scores:", ", ".join(f"{k}: {v:.2f}" for k, v in scores.items()))
            print(f"Unweighted Mean {metric}: {sum(scores.values()) / len(scores.values()):.2%}")
            print(f"Weighted Mean {metric}: {score:.2%}")
    else:  # all at once
        raw_scores = []
        if dt_depth is not None and len(set(cats.values())) > 2**dt_depth:
            warnings.warn(f"There are more classes ({len(set(cats.values()))}) than your decision-tree can possibly classify ({2**dt_depth})")
        targets = np.array(list(cats.values()))
        all_targets.append(targets)
        for n in range(repeat):
            score, _, rawscores = classify(ranked.values, targets, list(ranked.columns), list(catnames.values()), dt_depth,
                                           test_percentage_crossval, metric=metric, do_plot=do_plot,
                                           features_outvar=features_outvar, balance_classes=balance_classes, do_render=False)
            raw_scores.append(rawscores)
        all_classes.append(list(catnames.values()))
        if not shutup and repeat <= 1:
            print(f"{metric}: {score:.2f}")
            if dt_depth == 1:
                print(f"Baseline {metric}: {class_percents[0]:.2f}")  # everything into one class. In practice it's often worse than this because we have class_weight=balanced.
                if metric.lower() in ["acc", "accuracy"]:
                    print(f"Maximally achievable {metric}: {sum(class_percents[:2]):.2f}")  # two leaves, one is the (perfectly classified) class 1, the other gets the label for the second-most-common class
    if repeat > 1:
        return raw_scores, Counter(cats.values())
    if return_features:
        return [features_outvar, ranked, all_targets, list(scores.values()) if one_vs_rest else [score], all_classes, list(ranked.columns)]
    return score if not also_unweighted else (score, sum(scores.values()) / len(scores.values()) if one_vs_rest else np.nan)