def check_score_progress(model_name, latent, data, seed, kernel_config, init_type=None, ITERS_TO_RUN=ITERS_TO_RUN): print "Running", model_name, "*" * 40, np.random.randint(0, 10000) np.random.seed(seed) new_latent, new_data = irm.data.synth.prior_generate(latent, data) # estimate suffstats from the data run_truth = runner.Runner(new_latent, new_data, kernel_config, seed=0) print "latent=", new_latent irmio.estimate_suffstats(run_truth.model, run_truth.rng) print "estimated ss" # get ground truth ground_truth_score = run_truth.get_score() cleaned_up_latent = run_truth.get_state() rand_init = copy.deepcopy(cleaned_up_latent) # random init -- just the discrete variables # for the time being for di in cleaned_up_latent['domains']: d_N = len(rand_init['domains'][di]['assignment']) rand_init['domains'][di]['assignment'] = irm.util.crp_draw(d_N, 4.0) for ri in cleaned_up_latent['relations']: del rand_init['relations'][ri]['ss'] run_actual = runner.Runner(rand_init, new_data, kernel_config, seed=seed) rand_init_score = run_actual.get_score() print "rand_init_score=", rand_init_score print "ground_truth_score=", ground_truth_score assert_greater(ground_truth_score, rand_init_score) if init_type != None: run_actual.init(init_type) iter_count = 0 ITER_OVER = 10000 while (run_actual.get_score() - ground_truth_score) < -50: # well this is sort of bullshit run_actual.run_iters(ITERS_TO_RUN) iter_count += ITERS_TO_RUN print iter_count, model_name, run_actual.get_score( ), ground_truth_score assert_less(iter_count, ITERS_TO_RUN * ITER_OVER, "Too many iterations to get good score")
def check_score_progress(model_name, latent, data, seed, kernel_config, init_type = None, ITERS_TO_RUN=ITERS_TO_RUN): print "Running", model_name, "*"*40, np.random.randint(0, 10000) np.random.seed(seed) new_latent, new_data = irm.data.synth.prior_generate(latent, data) # estimate suffstats from the data run_truth = runner.Runner(new_latent, new_data, kernel_config, seed=0) print "latent=", new_latent irmio.estimate_suffstats(run_truth.model, run_truth.rng) print "estimated ss" # get ground truth ground_truth_score = run_truth.get_score() cleaned_up_latent = run_truth.get_state() rand_init = copy.deepcopy(cleaned_up_latent) # random init -- just the discrete variables # for the time being for di in cleaned_up_latent['domains']: d_N = len(rand_init['domains'][di]['assignment']) rand_init['domains'][di]['assignment'] = irm.util.crp_draw(d_N, 4.0) for ri in cleaned_up_latent['relations']: del rand_init['relations'][ri]['ss'] run_actual = runner.Runner(rand_init, new_data, kernel_config, seed=seed) rand_init_score = run_actual.get_score() print "rand_init_score=", rand_init_score print "ground_truth_score=", ground_truth_score assert_greater(ground_truth_score, rand_init_score ) if init_type != None: run_actual.init(init_type) iter_count = 0 ITER_OVER = 10000 while (run_actual.get_score() - ground_truth_score) < -50: # well this is sort of bullshit run_actual.run_iters(ITERS_TO_RUN) iter_count += ITERS_TO_RUN print iter_count, model_name, run_actual.get_score(), ground_truth_score assert_less(iter_count, ITERS_TO_RUN*ITER_OVER, "Too many iterations to get good score")
def run_benchmark(infile, outfile, model_name, latent, data, seed,
                  kernel_name, iters_to_run, GROUP_N, ENTITIES_PER_GROUP,
                  relclass):
    """
    Run inference on prior-generated data and pickle per-iteration
    kernel timings to `outfile`.

    Arguments:
      infile             -- not read by this function; kept so the
                            signature stays unchanged for callers
      outfile            -- path that receives the pickled list of records
      model_name         -- copied into every record
      latent, data       -- passed to irm.data.synth.prior_generate
      seed               -- seeds np.random; also copied into every record
      kernel_name        -- key into KERNELS selecting the kernel config
      iters_to_run       -- number of iterations handed to run_iters
      GROUP_N            -- recorded as 'group_n' (not otherwise used)
      ENTITIES_PER_GROUP -- recorded as 'entities_per_group' (not
                            otherwise used)
      relclass           -- key into RELATION_CLASSES; any value other
                            than 'relation' also enables an 8-way
                            irm.ThreadPool

    The pickled result is a list with one dict per logger call; each dict
    carries the run metadata plus one 'kernel_time.<name>' entry for every
    pair in iter_res['kernel_times'].
    """
    kernel_config = KERNELS[kernel_name]

    np.random.seed(seed)
    new_latent, new_data = irm.data.synth.prior_generate(latent, data)

    if relclass == 'relation':
        threadpool = None
    else:
        threadpool = irm.ThreadPool(8)

    run_truth = runner.Runner(new_latent, new_data, kernel_config, seed=0,
                              relation_class=RELATION_CLASSES[relclass],
                              threadpool=threadpool)
    # estimate suffstats from the data
    irmio.estimate_suffstats(run_truth.model, run_truth.rng)

    res = []

    def logger(iters, model, iter_res):
        # Callback given to run_iters; `model` is accepted but not used.
        out = {
            'model_name': model_name,
            'iter': iters,
            'seed': seed,
            'kernel_name': kernel_name,
            'relclass': relclass,
            'group_n': GROUP_N,
            'entities_per_group': ENTITIES_PER_GROUP,
        }
        for t, ti in iter_res['kernel_times']:
            out['kernel_time.%s' % t] = ti
        res.append(out)

    run_truth.run_iters(iters_to_run, logger)

    # Binary mode is what pickle expects, and the context manager closes
    # the handle even if dump() raises (the original leaked it).
    with open(outfile, 'wb') as out_fid:
        pickle.dump(res, out_fid)
def run_benchmark(infile, outfile, model_name, latent, data, seed,
                  kernel_name, iters_to_run, GROUP_N, ENTITIES_PER_GROUP,
                  relclass):
    """
    Run inference on prior-generated data and pickle per-iteration
    kernel timings to `outfile`.

    Arguments:
      infile             -- not read by this function; kept so the
                            signature stays unchanged for callers
      outfile            -- path that receives the pickled list of records
      model_name         -- copied into every record
      latent, data       -- passed to irm.data.synth.prior_generate
      seed               -- seeds np.random; also copied into every record
      kernel_name        -- key into KERNELS selecting the kernel config
      iters_to_run       -- number of iterations handed to run_iters
      GROUP_N            -- recorded as 'group_n' (not otherwise used)
      ENTITIES_PER_GROUP -- recorded as 'entities_per_group' (not
                            otherwise used)
      relclass           -- key into RELATION_CLASSES; any value other
                            than 'relation' also enables an 8-way
                            irm.ThreadPool

    The pickled result is a list with one dict per logger call; each dict
    carries the run metadata plus one 'kernel_time.<name>' entry for every
    pair in iter_res['kernel_times'].
    """
    kernel_config = KERNELS[kernel_name]

    np.random.seed(seed)
    new_latent, new_data = irm.data.synth.prior_generate(latent, data)

    if relclass == 'relation':
        threadpool = None
    else:
        threadpool = irm.ThreadPool(8)

    run_truth = runner.Runner(new_latent, new_data, kernel_config, seed=0,
                              relation_class=RELATION_CLASSES[relclass],
                              threadpool=threadpool)
    # estimate suffstats from the data
    irmio.estimate_suffstats(run_truth.model, run_truth.rng)

    res = []

    def logger(iters, model, iter_res):
        # Callback given to run_iters; `model` is accepted but not used.
        out = {
            'model_name': model_name,
            'iter': iters,
            'seed': seed,
            'kernel_name': kernel_name,
            'relclass': relclass,
            'group_n': GROUP_N,
            'entities_per_group': ENTITIES_PER_GROUP,
        }
        for t, ti in iter_res['kernel_times']:
            out['kernel_time.%s' % t] = ti
        res.append(out)

    run_truth.run_iters(iters_to_run, logger)

    # Binary mode is what pickle expects, and the context manager closes
    # the handle even if dump() raises (the original leaked it).
    with open(outfile, 'wb') as out_fid:
        pickle.dump(res, out_fid)
def test_fixed_k(): # create synthetic data with K groups seed = 0 np.random.seed(seed) GROUP_N = 30 ENTITIES_PER_GROUP = 10 N = GROUP_N * ENTITIES_PER_GROUP model_name = "BetaBernoulli" a = np.random.permutation(np.arange(N) % GROUP_N) latent = {'domains' : {'d1' : {'assignment' : a, }}, 'relations' : {'R1' : {'hps' : {'alpha' : 0.5, 'beta' : 0.5}}}} # ss = {} # for g1 in range(GROUP_N): # for g2 in range(GROUP_N): # ss[(g1, g2)] = {'p' : np.random.beta(0.5, 0.5)} # latent['domains']['d1']['ss'] = ss data = {'domains' : {'d1' : {'N' : N}}, 'relations' : {'R1' : {'relation' : ('d1', 'd1'), 'model' : model_name}}} new_latent, new_data = irm.data.synth.prior_generate(latent, data) # print new_data # m = new_data['relations']['R1']['data'] # f = pylab.figure() # ax = f.add_subplot(1, 2, 1) # ax.imshow(m, interpolation='nearest') # ax2 = f.add_subplot(1, 2, 2) # ai = np.argsort(a).flatten() # m2 = m[ai, :] # m2 = m2[:, ai] # ax2.imshow(m2, interpolation='nearest') # pylab.show() # create model and initialize with that K # score # do inference # does score, assignment vector get better? 
kernel_config = irm.runner.default_kernel_fixed_config() run_truth = runner.Runner(new_latent, new_data, kernel_config, seed=0, fixed_k=True) irmio.estimate_suffstats(run_truth.model, run_truth.rng) # get ground truth ground_truth_score = run_truth.get_score() cleaned_up_latent = run_truth.get_state() rand_init = copy.deepcopy(cleaned_up_latent) # random init -- just the discrete variables # for the time being for di in cleaned_up_latent['domains']: d_N = len(rand_init['domains'][di]['assignment']) rand_init['domains'][di]['assignment'] = np.random.permutation(np.arange(d_N) % GROUP_N) for ri in cleaned_up_latent['relations']: del rand_init['relations'][ri]['ss'] run_actual = runner.Runner(rand_init, new_data, kernel_config, seed=seed) rand_init_score = run_actual.get_score() print "rand_init_score=", rand_init_score print "ground_truth_score=", ground_truth_score assert_greater(ground_truth_score, rand_init_score ) iter_count = 0 ITER_OVER = 1000 ITERS_TO_RUN = 1 while (run_actual.get_score() - ground_truth_score) < -50: # well this is sort of bullshit run_actual.run_iters(ITERS_TO_RUN) iter_count += ITERS_TO_RUN print iter_count, model_name, run_actual.get_score(), ground_truth_score assert_less(iter_count, ITERS_TO_RUN*ITER_OVER, "Too many iterations to get good score")
def test_fixed_k(): # create synthetic data with K groups seed = 0 np.random.seed(seed) GROUP_N = 30 ENTITIES_PER_GROUP = 10 N = GROUP_N * ENTITIES_PER_GROUP model_name = "BetaBernoulli" a = np.random.permutation(np.arange(N) % GROUP_N) latent = { 'domains': { 'd1': { 'assignment': a, } }, 'relations': { 'R1': { 'hps': { 'alpha': 0.5, 'beta': 0.5 } } } } # ss = {} # for g1 in range(GROUP_N): # for g2 in range(GROUP_N): # ss[(g1, g2)] = {'p' : np.random.beta(0.5, 0.5)} # latent['domains']['d1']['ss'] = ss data = { 'domains': { 'd1': { 'N': N } }, 'relations': { 'R1': { 'relation': ('d1', 'd1'), 'model': model_name } } } new_latent, new_data = irm.data.synth.prior_generate(latent, data) # print new_data # m = new_data['relations']['R1']['data'] # f = pylab.figure() # ax = f.add_subplot(1, 2, 1) # ax.imshow(m, interpolation='nearest') # ax2 = f.add_subplot(1, 2, 2) # ai = np.argsort(a).flatten() # m2 = m[ai, :] # m2 = m2[:, ai] # ax2.imshow(m2, interpolation='nearest') # pylab.show() # create model and initialize with that K # score # do inference # does score, assignment vector get better? 
kernel_config = irm.runner.default_kernel_fixed_config() run_truth = runner.Runner(new_latent, new_data, kernel_config, seed=0, fixed_k=True) irmio.estimate_suffstats(run_truth.model, run_truth.rng) # get ground truth ground_truth_score = run_truth.get_score() cleaned_up_latent = run_truth.get_state() rand_init = copy.deepcopy(cleaned_up_latent) # random init -- just the discrete variables # for the time being for di in cleaned_up_latent['domains']: d_N = len(rand_init['domains'][di]['assignment']) rand_init['domains'][di]['assignment'] = np.random.permutation( np.arange(d_N) % GROUP_N) for ri in cleaned_up_latent['relations']: del rand_init['relations'][ri]['ss'] run_actual = runner.Runner(rand_init, new_data, kernel_config, seed=seed) rand_init_score = run_actual.get_score() print "rand_init_score=", rand_init_score print "ground_truth_score=", ground_truth_score assert_greater(ground_truth_score, rand_init_score) iter_count = 0 ITER_OVER = 1000 ITERS_TO_RUN = 1 while (run_actual.get_score() - ground_truth_score) < -50: # well this is sort of bullshit run_actual.run_iters(ITERS_TO_RUN) iter_count += ITERS_TO_RUN print iter_count, model_name, run_actual.get_score( ), ground_truth_score assert_less(iter_count, ITERS_TO_RUN * ITER_OVER, "Too many iterations to get good score")