def make_phase_1():
    """Generate the phase 1 datasets.

    For each ``(d, k)`` setting, obtains an output directory from
    ``get_dirname`` (with ``makedir=True``), prints it, and then saves 100
    ``bmcc.GaussianMixture`` datasets into it, each under a fresh UUID name.
    Every dataset uses ``n = 80 * d * k``, ``r=0.8``, ``alpha=40`` and
    ``df=d``.
    """
    settings = ((3, 3), (3, 5), (3, 8), (5, 3), (8, 3))

    for dim, clusters in settings:
        # The sample count is tied to both settings for this phase.
        n_points = 80 * dim * clusters

        out_dir = get_dirname(dim, n_points, clusters, 0.8, makedir=True)
        print(out_dir)

        for _ in tqdm(range(100)):
            mixture = bmcc.GaussianMixture(
                n=n_points, k=clusters, d=dim, r=0.8, alpha=40, df=dim,
                symmetric=False, shuffle=False)
            # A random UUID keeps repeated runs from overwriting each other.
            mixture.save(os.path.join(out_dir, str(uuid.uuid4())))
def run_evaluate(args):
    """Evaluate samples

    Scores the saved sampling history of one (dataset, method) pair and
    writes the scores as JSON next to a trace figure.

    Parameters
    ----------
    args : tuple
        ``(path, method_name)``. ``path`` is the dataset file under
        ``BASE_DIR``; the matching result and evaluation paths are derived
        by swapping ``BASE_DIR`` for ``RESULT_DIR/method_name`` and
        ``EVAL_DIR/method_name`` respectively.

    Returns
    -------
    None
        Results are written to ``<eval path>_scores.json`` and
        ``<eval path>_trace``. Nothing is done if the scores file already
        exists.
    """
    path, method_name = args
    result_path = path.replace(
        BASE_DIR, os.path.join(RESULT_DIR, method_name))
    eval_prefix = path.replace(
        BASE_DIR, os.path.join(EVAL_DIR, method_name))
    scores_path = eval_prefix + "_scores.json"

    # Ignore if already run
    if os.path.exists(scores_path):
        return

    # Load dataset
    dataset = bmcc.GaussianMixture(path, load=True)

    # np.load returns a lazily-read NpzFile for .npz archives; close it as
    # soon as the array has been materialized so the handle is not leaked.
    with np.load(result_path) as archive:
        hist = archive['hist']

    if hist.shape[0] < 2:
        payload = json.dumps({"errored": "no iterations saved"})
        with open(scores_path, "w") as f:
            f.write(payload)
        return

    # If procedure terminates before 2000it (400 once thinned), use 2nd half
    # of samples
    bi_base = min(hist.shape[0] // 2, 200)

    # Base evaluation
    # We don't care about the oracle matrix for now, so skip computation
    res = bmcc.LstsqResult(dataset.data, hist, burn_in=bi_base)
    res.evaluate(
        dataset.assignments, oracle=dataset.oracle, oracle_matrix=None)

    # Save trace
    save_fig(res.trace(), eval_prefix + "_trace")

    # Save scores
    scores = {
        "rand": res.rand_best,
        "nmi": res.nmi_best,
        "oracle_rand": res.oracle_rand,
        "oracle_nmi": res.oracle_nmi,
        "num_clusters": int(res.num_clusters[res.best_idx]),
        "best_idx": int(res.best_idx),
        "aggregation": res.aggregation_best,
        "segregation": res.segregation_best,
        "oracle_aggregation": res.oracle_aggregation,
        "oracle_segregation": res.oracle_segregation,
        "iterations": hist.shape[0]
    }

    # Serialize before opening the file: if a value is not JSON-serializable
    # we must not leave behind an empty scores file, because its mere
    # existence makes later runs skip this evaluation.
    payload = json.dumps(scores)
    with open(scores_path, "w") as f:
        f.write(payload)
def make_phase_2():
    """Generate the phase 2 datasets.

    Sweeps ``d`` over ten values and ``n`` over three values with ``k=3``
    fixed. For each ``(d, n)`` pair, obtains an output directory from
    ``get_dirname`` (with ``makedir=True``), prints it, and saves 100
    ``bmcc.GaussianMixture`` datasets (``r=1.0``, ``alpha=40``, ``df=d``)
    into it under fresh UUID names.
    """
    dimensions = (3, 4, 5, 6, 8, 10, 12, 15, 18, 21)
    sample_sizes = (600, 800, 1000)

    for dim in dimensions:
        for n_points in sample_sizes:
            out_dir = get_dirname(dim, n_points, 3, 1.0, makedir=True)
            print(out_dir)

            for _ in tqdm(range(100)):
                mixture = bmcc.GaussianMixture(
                    n=n_points, k=3, d=dim, r=1.0, alpha=40, df=dim,
                    symmetric=False, shuffle=False)
                # A random UUID gives every generated dataset a unique name.
                mixture.save(os.path.join(out_dir, str(uuid.uuid4())))
def run_sample(args):
    """Run MCMC sampling

    ``args`` is a ``(path, method_name)`` pair. The dataset at ``path`` is
    loaded, a ``bmcc.BayesianMixture`` is built from the ``METHODS`` entry
    named ``method_name``, and up to 5000 iterations are run. The model's
    ``hist`` is then saved with ``np.savez`` to the result path (``path``
    with ``BASE_DIR`` swapped for ``RESULT_DIR/method_name``). Nothing is
    done if that result path already exists.
    """
    path, method_name = args
    out_path = path.replace(BASE_DIR, os.path.join(RESULT_DIR, method_name))

    # A result file on disk means this test was already run.
    if os.path.exists(out_path):
        return

    dataset = bmcc.GaussianMixture(path, load=True)
    spec = METHODS[method_name]

    # Component model: identity scale matrix only when SCALE_MATRIX is set.
    if SCALE_MATRIX:
        scale = np.identity(dataset.d)
    else:
        scale = None
    component_model = bmcc.NormalWishart(df=dataset.d, scale=scale)
    mixture_model = spec["mixture"](dataset.k)

    # All points start with assignment 0.
    initial_assignments = np.zeros(dataset.n).astype(np.uint16)
    model = bmcc.BayesianMixture(
        data=dataset.data,
        sampler=spec["sampler"],
        component_model=component_model,
        mixture_model=mixture_model,
        assignments=initial_assignments,
        thinning=5)

    # Iterate, stopping early once an assignment exceeds CLUSTERS_LIMIT.
    # Any exception is reported but does not prevent saving what was
    # collected so far.
    try:
        for _ in range(5000):
            model.iter()
            if np.max(model.assignments) > CLUSTERS_LIMIT:
                break
    except Exception as err:
        print("Exception in {} / {}:".format(method_name, path))
        print(err)

    np.savez(out_path, hist=model.hist)
def make_phase_3():
    """Generate the phase 3 datasets.

    Sweeps ``k`` with ``d=3`` and ``n = 50 * k``. For each ``k``, obtains an
    output directory from ``get_dirname`` (with ``makedir=True``), prints it,
    and saves 100 ``bmcc.GaussianMixture`` datasets (``r=1.0``, ``df=3``)
    into it under fresh UUID names. Means are passed in explicitly: the
    first ``k`` points of a 3-dimensional ``ghalton.Halton`` sequence, each
    scaled by ``(40 * k)**(1 / 3)``.
    """
    cluster_counts = (5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100)

    for k in cluster_counts:
        n_points = 50 * k
        # Same scale factor for every mean at this k.
        spread = (40 * k)**(1 / 3)

        out_dir = get_dirname(3, n_points, k, 1.0, makedir=True)
        print(out_dir)

        for _ in tqdm(range(100)):
            # Alternative kept for reference (uniform random means):
            # means = [
            #     (40 * k)**(1/3) * np.random.uniform(
            #         low=-0.5, high=0.5, size=3)
            #     for _ in range(k)
            # ]
            means = []
            for point in ghalton.Halton(3).get(k):
                means.append(spread * np.array(point))

            mixture = bmcc.GaussianMixture(
                n=n_points, k=k, d=3, r=1.0, df=3,
                symmetric=False, shuffle=False, means=means)
            mixture.save(os.path.join(out_dir, str(uuid.uuid4())))
from scipy.stats import poisson
import time
from tqdm import tqdm
import bmcc


# Settings
# NOTE(review): only POINTS is referenced in this excerpt; the other three
# are presumably consumed further down the script -- confirm.
ITERATIONS = 500
THINNING = 5
POINTS = 500
BURN_IN = 50

# Create dataset
dataset = bmcc.GaussianMixture(
    n=POINTS, k=3, d=3, r=0.7, alpha=40, df=3, symmetric=False, shuffle=False)


def hybrid(*args, **kwargs):
    """Call ``bmcc.gibbs`` five times, then ``bmcc.split_merge`` once.

    All positional and keyword arguments are forwarded unchanged to every
    call.
    """
    # NOTE(review): the source layout was collapsed; split_merge is assumed
    # to run once after the five gibbs calls, not inside the loop -- confirm.
    for _ in range(5):
        bmcc.gibbs(*args, **kwargs)
    bmcc.split_merge(*args, **kwargs)


# Mixture model: MFM with gamma=1 and a prior given by the Poisson log-pmf
# with mu=3 evaluated at k.
mm = bmcc.MFM(gamma=1, prior=lambda k: poisson.logpmf(k, 3))
# Alternative mixture model (disabled):
# mm = bmcc.DPM(alpha=1, use_eb=False)
# Component model.
cm = bmcc.NormalWishart(df=3)
import bmcc

# Save/load round trip: generate a dataset, plot it, write it to disk, read
# it back, and plot the reloaded copy.
_GENERATION_ARGS = dict(
    n=1000, k=3, d=3, r=0.7, alpha=40, df=3,
    symmetric=False, shuffle=False)
_ARCHIVE_PATH = "tmp.npz"

dataset = bmcc.GaussianMixture(**_GENERATION_ARGS)
dataset.plot_actual(plot=True)
dataset.save(_ARCHIVE_PATH)

# Rebind to the copy read back from disk and plot it again.
dataset = bmcc.GaussianMixture(_ARCHIVE_PATH, load=True)
dataset.plot_actual(plot=True)