def test_estep():
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], np.zeros(0, dtype='d'))

    alpha_zh = .1
    beta_zs = .1

    assert_equal(Count_zh.sum(), 10)
    assert_equal(Count_sz.sum(), 20)

    assert_equal(count_h[0], 4)
    assert_equal(count_h[1], 4)
    assert_equal(count_h[2], 2)

    new_state = _learn._e_step(tstamps, Trace, previous_stamps, Count_zh, \
            Count_sz, count_h, count_z, alpha_zh, beta_zs, prob_topics_aux, \
            kernel)

    # The E-step only resamples topic assignments; the marginal counts
    # must be preserved.
    assert_equal(count_h[0], 4)
    assert_equal(count_h[1], 4)
    assert_equal(count_h[2], 2)

    assert_equal(Count_zh.sum(), 10)
    assert_equal(Count_sz.sum(), 20)
def test_em():
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], np.zeros(0, dtype='d'))

    alpha_zh = .1
    beta_zs = .1

    assert (Theta_zh == 0).all()
    assert (Psi_sz == 0).all()

    old_Count_zh = Count_zh.copy()
    old_Count_sz = Count_sz.copy()
    old_count_h = count_h.copy()
    old_count_z = count_z.copy()

    _learn.em(tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, \
            count_z, alpha_zh, beta_zs, prob_topics_aux, Theta_zh, Psi_sz, \
            10, 2, kernel)

    assert (Theta_zh > 0).sum() > 0
    assert (Psi_sz > 0).sum() > 0

    # After aggregation both matrices are column-stochastic.
    assert_almost_equal(1, Theta_zh.sum(axis=0))
    assert_almost_equal(1, Psi_sz.sum(axis=0))

    assert (old_Count_zh != Count_zh).any()
    assert (old_Count_sz != Count_sz).any()
    assert (old_count_h == count_h).all()  # count_h should not change
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, \
        residency_priors, num_iter, burn_in, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Node-Sherlock
    is an EM algorithm in which the E-step is a Gibbs sampling update, which
    is why we use a `num_iter` and `burn_in` approach instead of the usual
    convergence-based stopping criterion.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) tuple, where the
        timestamp is a long (seconds or milliseconds from epoch).

    num_topics : int
        The number of latent spaces to learn

    alpha_zh : float
        The value of the alpha_zh hyperparameter

    beta_zs : float
        The value of the beta_zs (beta) hyperparameter

    kernel : Kernel object
        The kernel to use

    residency_priors : array of float
        The kernel hyperparameters

    num_iter : int
        The number of iterations to learn the model from

    burn_in : int
        The burn-in of the chain

    Returns
    -------
    A dictionary with the results.
    '''
    Dts, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_sz, \
            hyper2id, source2id = \
            dataio.initialize_trace(trace_fpath, num_topics, num_iter, \
            from_, to)
    em(Dts, Trace, previous_stamps, Count_zh, \
            Count_sz, count_h, count_z, alpha_zh, beta_zs, \
            prob_topics_aux, Theta_zh, Psi_sz, num_iter, \
            burn_in, kernel)

    rv = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs, \
            kernel, residency_priors, num_iter, burn_in, Dts, Trace, \
            Count_zh, Count_sz, count_h, \
            count_z, prob_topics_aux, Theta_zh, Psi_sz, hyper2id, \
            source2id, from_, to)
    rv['algorithm'] = np.asarray(['serial gibbs + em'])
    return rv
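# A minimal usage sketch for fit(), not part of the original module: the
# import path `node_sherlock.kernels`, the trace file name, and the
# hyperparameter values are assumptions for illustration. NoopKernel is
# built with an empty prior array, mirroring the tests in this repository.
def _example_serial_fit(trace_fpath='trace.dat', trace_len=1000):
    import numpy as np
    from node_sherlock.kernels import NoopKernel  # assumed import path

    num_topics = 10
    kernel = NoopKernel()
    # The tests build kernels as (num events, num states, priors).
    kernel.build(trace_len, num_topics, np.zeros(0, dtype='d'))
    return fit(trace_fpath, num_topics, alpha_zh=0.1, beta_zs=0.1,
               kernel=kernel, residency_priors=np.zeros(0, dtype='d'),
               num_iter=1000, burn_in=300)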
def test_split():
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    alpha_zh, beta_zs, a_ptz, b_ptz = [0.1] * 4
    ll_per_z = np.zeros(2, dtype='f8')

    # Collapse everything into a single topic with two clearly separated
    # groups of inter-event times.
    Trace[:, -1] = 0
    previous_stamps._clear()
    tstamps = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 100, 200, 300, 400, 500]]).T
    tstamps = np.array(tstamps, order='C')
    previous_stamps._extend(0, tstamps[:, 0])

    Count_zh = np.zeros(shape=(1, Count_zh.shape[1]), dtype='i4')
    Count_sz = np.zeros(shape=(Count_sz.shape[0], 1), dtype='i4')
    count_z = np.zeros(shape=(1, ), dtype='i4')

    fast_populate(Trace, Count_zh, Count_sz, count_h, count_z)

    kernel = ECCDFKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[0], \
            np.array([1.0, Count_zh.shape[0] - 1]))

    ll_per_z = np.zeros(1, dtype='f8')
    quality_estimate(tstamps, Trace, \
            previous_stamps, Count_zh, Count_sz, count_h, \
            count_z, alpha_zh, beta_zs, \
            ll_per_z, np.arange(Trace.shape[0], dtype='i4'), \
            kernel)

    Trace_new, Count_zh_new, Count_sz_new, count_z_new, \
            new_stamps, _ = \
            dynamic.split(tstamps, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, alpha_zh, beta_zs, \
            ll_per_z, kernel, .5, 0)

    # The split should separate the two time-scale groups into two topics.
    assert_array_equal(Trace_new[:, -1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    assert_array_equal(new_stamps._get_all(0), [1, 2, 3, 4, 5])
    assert_array_equal(new_stamps._get_all(1), [100, 200, 300, 400, 500])

    assert Count_zh_new.shape[0] > Count_zh.shape[0]
    assert Count_zh_new.shape[1] == Count_zh.shape[1]
    assert Count_zh_new[0].sum() == 5
    assert Count_zh_new[1].sum() == 5

    assert Count_sz_new.shape[0] == Count_sz.shape[0]
    assert Count_sz_new.shape[1] > Count_sz.shape[1]
    assert Count_sz_new[:, 0].sum() == 10
    assert Count_sz_new[:, 1].sum() == 10

    assert count_z_new.shape[0] > count_z.shape[0]
    assert count_z_new[0] == 10
    assert count_z_new[1] == 10
def test_correlate_all():
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    C = dynamic.correlate_counts(Count_zh, Count_sz, count_h, count_z, .1, \
            .1)

    assert_equal((2, 2), C.shape)
    assert C[0, 1] != 0
    assert (np.tril(C) == 0).all()  # only the strict upper triangle is filled
def test_merge():
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    kernel = ECCDFKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[0], \
            np.array([1.0, Count_zh.shape[0] - 1]))

    # Force every event onto the same hypernode/source/destination, split
    # across two topic labels; merge should collapse the redundant topic.
    Trace[:, 0] = 0
    Trace[:, 1] = 0
    Trace[:, 2] = 0
    Trace[:, 3] = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    Count_sz[:] = 0
    Count_zh[:] = 0
    count_z[:] = 0
    count_h[:] = 0

    fast_populate(Trace, Count_zh, Count_sz, count_h, count_z)

    C = dynamic.correlate_counts(Count_zh, Count_sz, count_h, count_z, .1, \
            .1)

    alpha_zh, beta_zs, beta_zd = [0.1] * 3
    a_ptz = 1.0
    b_ptz = Count_zh.shape[0] - 1

    ll_per_z = np.zeros(2, dtype='f8')
    quality_estimate(tstamps, Trace, \
            previous_stamps, Count_zh, Count_sz, count_h, \
            count_z, alpha_zh, beta_zd, ll_per_z, \
            np.arange(Trace.shape[0], dtype='i4'), kernel)

    Trace_new, Count_zh_new, Count_sz_new, \
            count_z_new, new_stamps, _ = \
            dynamic.merge(tstamps, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, alpha_zh, beta_zs, ll_per_z, kernel)

    print(Trace_new)
    print(np.array(new_stamps._get_all(0)))

    assert len(new_stamps._get_all(0)) == 10
    assert Count_zh_new.shape[0] < Count_zh.shape[0]
    assert Count_zh_new.shape[1] == Count_zh.shape[1]

    assert Count_sz_new.shape[0] == Count_sz.shape[0]
    assert Count_sz_new.shape[1] < Count_sz.shape[1]

    assert count_z_new.shape[0] < count_z.shape[0]
def test_sample():
    tstamps, Trace, previous_stamps, Count_zh, Count_sz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_sz, hyper2id, source2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    kernel = NoopKernel()
    kernel.build(Trace.shape[0], Count_zh.shape[1], np.zeros(0, dtype='d'))

    tstamp_idx = 3
    hyper = Trace[tstamp_idx, 0]
    source = Trace[tstamp_idx, 1]
    dest = Trace[tstamp_idx, 2]
    old_topic = Trace[tstamp_idx, 3]

    new_topic = _learn._sample(tstamp_idx, tstamps, Trace, \
            previous_stamps, Count_zh, Count_sz, count_h, \
            count_z, .1, .1, prob_topics_aux, kernel)

    assert new_topic <= 3
def test_initialize():
    tstamps, Trace, previous_stamps, Count_zh, Count_oz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_oz, hyper2id, obj2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10)

    assert_equal(len(hyper2id), 3)
    assert_equal(len(obj2id), 6)

    assert_equal(Trace.shape[0], 10)
    assert_equal(Trace.shape[1], 4)

    for z in [0, 1]:
        assert previous_stamps._size(z) > 0

    assert_equal(count_h[0], 4)
    assert_equal(count_h[1], 4)
    assert_equal(count_h[2], 2)

    assert_equal(count_z.sum(), 20)  # depends on memory

    # We can only test shapes and sums, since assignments are random
    assert_equal(Count_zh.shape, (2, 3))
    assert_equal(Count_oz.shape, (6, 2))
    assert_equal(Count_zh.sum(), 10)
    assert_equal(Count_oz.sum(), 20)

    assert (prob_topics_aux == 0).all()

    # Simple sanity check on topic assignments. Check if topics have valid
    # ids and if the counts match the count matrix
    from collections import Counter
    c = Counter(Trace[:, -1])
    for topic in c:
        assert topic in [0, 1]
        assert c[topic] == count_z[topic] / 2
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, \
        residency_priors, num_iter, num_batches, mpi_mode, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Here we do an
    asynchronous learning of the topics similar to AD-LDA, as well as the
    dynamic topic expansion/pruning.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) tuple, where the
        timestamp is a long (seconds or milliseconds from epoch).

    num_topics : int
        The number of latent spaces to learn

    alpha_zh : float
        The value of the alpha_zh hyperparameter

    beta_zs : float
        The value of the beta_zs (beta) hyperparameter

    kernel : Kernel object
        The kernel to use

    residency_priors : array of float
        The kernel hyperparameters

    num_iter : int
        The number of iterations to learn the model from

    num_batches : int
        Defines the number of batches of size num_iter

    Returns
    -------
    A dictionary with the results.
    '''
    assert num_batches >= 2
    comm = MPI.COMM_WORLD
    num_workers = comm.size - 1

    Dts, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_sz, \
            hyper2id, source2id = \
            dataio.initialize_trace(trace_fpath, num_topics, num_iter, \
            from_, to)

    if mpi_mode:
        workloads = generate_workload(Count_zh.shape[1], num_workers, Trace)
    all_idx = np.arange(Trace.shape[0], dtype='i4')

    for batch in xrange(num_batches):
        print('Now at batch', batch)
        if mpi_mode:
            for worker_id in xrange(1, num_workers + 1):
                comm.send(num_iter, dest=worker_id, tag=Msg.LEARN.value)
            dispatch_jobs(Dts, Trace, Count_zh, Count_sz, \
                    count_h, count_z, alpha_zh, beta_zs, kernel, \
                    residency_priors, workloads, num_workers, comm)
            manage(comm, num_workers)
            fetch_results(comm, num_workers, workloads, Dts, Trace, \
                    previous_stamps, Count_zh, Count_sz, count_h, \
                    count_z, alpha_zh, beta_zs, Theta_zh, Psi_sz, \
                    kernel)
        else:
            prob_topics_aux = np.zeros(Count_zh.shape[0], dtype='f8')
            _learn.em(Dts, Trace, previous_stamps, Count_zh, Count_sz, \
                    count_h, count_z, alpha_zh, beta_zs, \
                    prob_topics_aux, Theta_zh, Psi_sz, num_iter, \
                    num_iter * 2, kernel, False)

        print('Split')
        ll_per_z = np.zeros(count_z.shape[0], dtype='f8')
        _eval.quality_estimate(Dts, Trace, previous_stamps, \
                Count_zh, Count_sz, count_h, count_z, alpha_zh, \
                beta_zs, ll_per_z, all_idx, kernel)
        Trace, Count_zh, Count_sz, count_z, previous_stamps, \
                P = split(Dts, Trace, previous_stamps, Count_zh, \
                Count_sz, count_h, count_z, alpha_zh, beta_zs, \
                ll_per_z, kernel)
        # The number of topics may have changed; rebuild the kernel.
        kernel = kernel.__class__()
        kernel.build(Trace.shape[0], Count_zh.shape[0], residency_priors)
        if residency_priors.shape[0] > 0:
            kernel.update_state(P)

        print('Merge')
        ll_per_z = np.zeros(count_z.shape[0], dtype='f8')
        _eval.quality_estimate(Dts, Trace, previous_stamps, \
                Count_zh, Count_sz, count_h, count_z, alpha_zh, \
                beta_zs, ll_per_z, all_idx, kernel)
        Trace, Count_zh, Count_sz, count_z, previous_stamps, \
                P = merge(Dts, Trace, previous_stamps, Count_zh, \
                Count_sz, count_h, count_z, alpha_zh, beta_zs, \
                ll_per_z, kernel)
        kernel = kernel.__class__()
        kernel.build(Trace.shape[0], Count_zh.shape[0], residency_priors)
        if residency_priors.shape[0] > 0:
            kernel.update_state(P)

        Theta_zh = np.zeros(shape=Count_zh.shape, dtype='f8')
        Psi_sz = np.zeros(shape=Count_sz.shape, dtype='f8')
        if batch == num_batches - 1:
            print('Computing probs')
            _learn._aggregate(Count_zh, Count_sz, count_h, count_z, \
                    alpha_zh, beta_zs, Theta_zh, Psi_sz)
        print('New nz', Count_zh.shape[0])

    if mpi_mode:
        for worker_id in xrange(1, num_workers + 1):
            comm.send(num_iter, dest=worker_id, tag=Msg.STOP.value)

    rv = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs, \
            kernel, residency_priors, num_iter, -1, Dts, Trace, \
            Count_zh, Count_sz, count_h, count_z, prob_topics_aux, Theta_zh, \
            Psi_sz, hyper2id, source2id, from_, to)
    rv['num_workers'] = np.asarray([num_workers])
    rv['num_batches'] = np.asarray([num_batches])
    rv['algorithm'] = np.asarray(['parallel dynamic'])
    return rv
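# A minimal launch sketch, not part of the original module: with
# mpi_mode=True, rank 0 runs this fit() as the master while ranks
# 1..comm.size - 1 act as workers, so the script would be started with
# something like `mpiexec -n 4 python script.py`. The kernel class, prior
# values, and hyperparameters below are illustrative assumptions; note that
# fit() rebuilds the kernel after every split/merge step.
def _example_dynamic_fit(trace_fpath='trace.dat', trace_len=1000):
    import numpy as np
    from node_sherlock.kernels import ECCDFKernel  # assumed import path

    num_topics = 10
    residency_priors = np.array([1.0, num_topics - 1.0])  # as in the tests
    kernel = ECCDFKernel()
    kernel.build(trace_len, num_topics, residency_priors)
    return fit(trace_fpath, num_topics, alpha_zh=0.1, beta_zs=0.1,
               kernel=kernel, residency_priors=residency_priors,
               num_iter=100, num_batches=5, mpi_mode=True)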
def test_initialize_limits():
    tstamps, Trace, previous_stamps, Count_zh, Count_oz, count_h, count_z, \
            prob_topics_aux, Theta_zh, Psi_oz, hyper2id, obj2id = \
            dataio.initialize_trace(files.SIZE10, 2, 10, 2, 5)

    assert len(tstamps) == 3
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, residency_priors, \
        num_iter, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Here we do an
    asynchronous learning of the topics similar to AD-LDA. An even number of
    threads is required.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) tuple, where the
        timestamp is a long (seconds or milliseconds from epoch).

    num_topics : int
        The number of latent spaces to learn

    alpha_zh : float
        The value of the alpha_zh hyperparameter

    beta_zs : float
        The value of the beta_zs (beta) hyperparameter

    kernel : Kernel object
        The kernel to use

    residency_priors : array of float
        The kernel hyperparameters

    num_iter : int
        The number of iterations to learn the model from

    Returns
    -------
    A dictionary with the results.
    '''
    comm = MPI.COMM_WORLD
    num_workers = comm.size - 1

    Dts, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_sz, \
            hyper2id, source2id = \
            dataio.initialize_trace(trace_fpath, num_topics, num_iter, \
            from_, to)

    for worker_id in xrange(1, num_workers + 1):
        comm.send(num_iter, dest=worker_id, tag=Msg.LEARN.value)

    workloads = generate_workload(Count_zh.shape[1], num_workers, Trace)
    dispatch_jobs(Dts, Trace, Count_zh, Count_sz, count_h, \
            count_z, alpha_zh, beta_zs, kernel, residency_priors, \
            workloads, num_workers, comm)
    manage(comm, num_workers)
    fetch_results(comm, num_workers, workloads, Dts, Trace, previous_stamps, \
            Count_zh, Count_sz, count_h, count_z, alpha_zh, \
            beta_zs, Theta_zh, Psi_sz, kernel)

    for worker_id in xrange(1, num_workers + 1):
        comm.send(worker_id, dest=worker_id, tag=Msg.STOP.value)

    rv = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs, \
            kernel, residency_priors, num_iter, -1, Dts, \
            Trace, Count_zh, Count_sz, count_h, \
            count_z, prob_topics_aux, Theta_zh, Psi_sz, hyper2id, \
            source2id, from_, to)
    rv['num_workers'] = np.asarray([num_workers])
    rv['algorithm'] = np.asarray(['parallel gibbs + em'])
    return rv
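# A minimal launch sketch, not part of the original module: this fit()
# always expects MPI workers (comm.size - 1 of them), so a run with 3
# workers would be started as `mpiexec -n 4 python script.py`, with rank 0
# executing the code below. All names and values here are illustrative
# assumptions.
def _example_parallel_fit(trace_fpath='trace.dat', trace_len=1000):
    import numpy as np
    from node_sherlock.kernels import NoopKernel  # assumed import path

    num_topics = 10
    kernel = NoopKernel()
    kernel.build(trace_len, num_topics, np.zeros(0, dtype='d'))
    return fit(trace_fpath, num_topics, alpha_zh=0.1, beta_zs=0.1,
               kernel=kernel, residency_priors=np.zeros(0, dtype='d'),
               num_iter=1000)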