def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, \
        residency_priors, num_iter, burn_in, from_=0, to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Node-Sherlock
    is an EM algorithm in which the E-step is a Gibbs sampling update, which
    is why we use a fixed `num_iter` with a `burn_in` period instead of the
    usual convergence-based stopping criterion.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) tuple, where the
        timestamp is a long (seconds or milliseconds from epoch).
    num_topics : int
        The number of latent spaces to learn
    alpha_zh : float
        The value of the alpha_zh hyperparameter
    beta_zs : float
        The value of the beta_zs (beta) hyperparameter
    kernel : Kernel object
        The kernel to use
    residency_priors : array of float
        The kernel hyperparameters
    num_iter : int
        The number of sampling iterations used to learn the model
    burn_in : int
        The burn-in of the chain

    Returns
    -------
    A dictionary with the results.
    '''
    Dts, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_sz, \
            hyper2id, source2id = \
            dataio.initialize_trace(trace_fpath, num_topics, num_iter, \
            from_, to)

    em(Dts, Trace, previous_stamps, Count_zh, \
            Count_sz, count_h, count_z, alpha_zh, beta_zs, \
            prob_topics_aux, Theta_zh, Psi_sz, num_iter, \
            burn_in, kernel)

    rv = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs, \
            kernel, residency_priors, num_iter, burn_in, Dts, Trace, \
            Count_zh, Count_sz, count_h, \
            count_z, prob_topics_aux, Theta_zh, Psi_sz, hyper2id, \
            source2id, from_, to)
    rv['algorithm'] = np.asarray(['serial gibbs + em'])
    return rv
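# Usage sketch for the serial fit above (illustrative only). `NoopKernel`
# and its import path are assumptions standing in for any concrete Kernel
# subclass exposing the build(...) interface used in this codebase, and the
# result keys are assumed to follow what prepare_results stores.
def _example_serial_fit():
    from kernels import NoopKernel  # hypothetical import path

    num_topics = 20
    residency_priors = np.array([], dtype='f8')
    kernel = NoopKernel()
    kernel.build(1, num_topics, residency_priors)  # placeholder trace size
    rv = fit('trace.dat', num_topics, 50.0 / num_topics, 0.001, kernel, \
            residency_priors, num_iter=2000, burn_in=1000)
    return rv['Theta_zh'], rv['Psi_sz']  # assumed result keys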
def sample(Dts, Trace, Count_zh, Count_sz_local, \
        count_h, count_z, alpha_zh, beta_zs, kernel, num_iter, comm):

    #Last counts received from each of the other worker ranks
    previous_encounters_s = {}
    for other_processor in range(1, comm.size):
        previous_encounters_s[other_processor] = \
                np.zeros_like(Count_sz_local)

    #Group the inter-event times by their currently assigned topic
    stamps = StampLists(Count_zh.shape[0])
    for z in range(Count_zh.shape[0]):
        idx = Trace[:, -1] == z
        stamps._extend(z, Dts[idx][:, -1])

    aux = np.zeros(Count_zh.shape[0], dtype='f8')

    Count_sz_pair = np.zeros_like(Count_sz_local)
    Count_sz_others = np.zeros_like(Count_sz_local)
    Count_sz_sum = np.zeros_like(Count_sz_local)

    Theta_zh = np.zeros_like(Count_zh, dtype='f8')
    Psi_sz = np.zeros_like(Count_sz_local, dtype='f8')

    can_pair = True
    for i in range(num_iter // CACHE_SIZE):
        #Sample from the local counts plus the counts last seen from others
        Count_sz_sum[:] = Count_sz_local + Count_sz_others
        count_z[:] = Count_sz_sum.sum(axis=0)

        em(Dts, Trace, stamps, Count_zh, Count_sz_sum, \
                count_h, count_z, alpha_zh, beta_zs, aux, Theta_zh, \
                Psi_sz, CACHE_SIZE, CACHE_SIZE * 2, kernel, False)

        #Update local counts
        Count_sz_local[:] = Count_sz_sum - Count_sz_others
        count_z[:] = Count_sz_local.sum(axis=0)

        #Update expected belief of other processors
        if can_pair:
            P_local = kernel.get_state()
            can_pair = paired_update(comm, previous_encounters_s, \
                    Count_sz_local, Count_sz_pair, Count_sz_others, \
                    P_local, np.zeros_like(P_local))
            kernel.update_state(P_local)
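# Note on the count exchange in sample() above: this mirrors AD-LDA style
# updates. Each worker runs the sampler on its local counts plus a cached
# view of the counts held by the other workers (Count_sz_others). After the
# sweep, the unchanged cached view is subtracted back out, so every token
# the sampler moved is attributed to Count_sz_local only. As a tiny worked
# example, if local = [[3, 1]] and others = [[1, 1]], the sampler sees
# [[4, 2]]; if it moves one token from topic 0 to topic 1 the sum becomes
# [[3, 3]], and subtracting others recovers local = [[2, 2]].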
def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, \
        residency_priors, num_iter, num_batches, mpi_mode, from_=0, \
        to=np.inf):
    '''
    Learns the latent topics from a temporal hypergraph trace. Here we
    perform asynchronous learning of the topics, similar to AD-LDA, combined
    with dynamic topic expansion/pruning.

    Parameters
    ----------
    trace_fpath : str
        The path of the trace. Each line should be a
        (timestamp, hypernode, source, destination) tuple, where the
        timestamp is a long (seconds or milliseconds from epoch).
    num_topics : int
        The number of latent spaces to learn
    alpha_zh : float
        The value of the alpha_zh hyperparameter
    beta_zs : float
        The value of the beta_zs (beta) hyperparameter
    kernel : Kernel object
        The kernel to use
    residency_priors : array of float
        The kernel hyperparameters
    num_iter : int
        The number of sampling iterations per batch
    num_batches : int
        The number of batches of size num_iter; each batch is followed by a
        split and a merge step
    mpi_mode : bool
        If True, distribute the sampling across MPI worker ranks; otherwise
        run the sampler in a single process

    Returns
    -------
    A dictionary with the results.
    '''
    assert num_batches >= 2
    comm = MPI.COMM_WORLD
    num_workers = comm.size - 1

    Dts, Trace, previous_stamps, Count_zh, Count_sz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_sz, \
            hyper2id, source2id = \
            dataio.initialize_trace(trace_fpath, num_topics, num_iter, \
            from_, to)

    if mpi_mode:
        workloads = generate_workload(Count_zh.shape[1], num_workers, Trace)
    all_idx = np.arange(Trace.shape[0], dtype='i4')

    for batch in range(num_batches):
        print('Now at batch', batch)
        if mpi_mode:
            for worker_id in range(1, num_workers + 1):
                comm.send(num_iter, dest=worker_id, tag=Msg.LEARN.value)
            dispatch_jobs(Dts, Trace, Count_zh, Count_sz, \
                    count_h, count_z, alpha_zh, beta_zs, kernel, \
                    residency_priors, workloads, num_workers, comm)
            manage(comm, num_workers)
            fetch_results(comm, num_workers, workloads, Dts, Trace, \
                    previous_stamps, Count_zh, Count_sz, count_h, \
                    count_z, alpha_zh, beta_zs, Theta_zh, Psi_sz, \
                    kernel)
        else:
            prob_topics_aux = np.zeros(Count_zh.shape[0], dtype='f8')
            _learn.em(Dts, Trace, previous_stamps, Count_zh, Count_sz, \
                    count_h, count_z, alpha_zh, beta_zs, \
                    prob_topics_aux, Theta_zh, Psi_sz, num_iter, \
                    num_iter * 2, kernel, False)

        print('Split')
        ll_per_z = np.zeros(count_z.shape[0], dtype='f8')
        _eval.quality_estimate(Dts, Trace, previous_stamps, \
                Count_zh, Count_sz, count_h, count_z, alpha_zh, \
                beta_zs, ll_per_z, all_idx, kernel)
        Trace, Count_zh, Count_sz, count_z, previous_stamps, \
                P = split(Dts, Trace, previous_stamps, Count_zh, \
                Count_sz, count_h, count_z, alpha_zh, beta_zs, \
                ll_per_z, kernel)
        kernel = kernel.__class__()
        kernel.build(Trace.shape[0], Count_zh.shape[0], residency_priors)
        if residency_priors.shape[0] > 0:
            kernel.update_state(P)

        print('Merge')
        ll_per_z = np.zeros(count_z.shape[0], dtype='f8')
        _eval.quality_estimate(Dts, Trace, previous_stamps, \
                Count_zh, Count_sz, count_h, count_z, alpha_zh, \
                beta_zs, ll_per_z, all_idx, kernel)
        Trace, Count_zh, Count_sz, count_z, previous_stamps, \
                P = merge(Dts, Trace, previous_stamps, Count_zh, \
                Count_sz, count_h, count_z, alpha_zh, beta_zs, \
                ll_per_z, kernel)
        kernel = kernel.__class__()
        kernel.build(Trace.shape[0], Count_zh.shape[0], residency_priors)
        if residency_priors.shape[0] > 0:
            kernel.update_state(P)

        Theta_zh = np.zeros(shape=Count_zh.shape, dtype='f8')
        Psi_sz = np.zeros(shape=Count_sz.shape, dtype='f8')

        if batch == num_batches - 1:
            print('Computing probs')
            _learn._aggregate(Count_zh, Count_sz, count_h, count_z, \
                    alpha_zh, beta_zs, Theta_zh, Psi_sz)
        print('New nz', Count_zh.shape[0])

    if mpi_mode:
        for worker_id in range(1, num_workers + 1):
            comm.send(num_iter, dest=worker_id, tag=Msg.STOP.value)

    rv = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs, \
            kernel, residency_priors, num_iter, -1, Dts, Trace, \
            Count_zh, Count_sz, count_h, count_z, prob_topics_aux, Theta_zh, \
            Psi_sz, hyper2id, source2id, from_, to)
    rv['num_workers'] = np.asarray([num_workers])
    rv['num_batches'] = np.asarray([num_batches])
    rv['algorithm'] = np.asarray(['parallel dynamic'])
    return rv
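# Usage sketch for the dynamic fit above (illustrative only). With
# mpi_mode=False everything runs in a single process; mpi_mode=True requires
# launching under mpiexec with at least two ranks so worker processes exist.
# `NoopKernel` is the same hypothetical kernel stand-in used earlier.
def _example_dynamic_fit():
    from kernels import NoopKernel  # hypothetical import path

    num_topics = 10
    residency_priors = np.array([], dtype='f8')
    kernel = NoopKernel()
    kernel.build(1, num_topics, residency_priors)  # placeholder trace size
    rv = fit('trace.dat', num_topics, 50.0 / num_topics, 0.001, kernel, \
            residency_priors, num_iter=500, num_batches=4, mpi_mode=False)
    return rv['algorithm']  # -> ['parallel dynamic']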