def fit(trace_fpath, num_topics, alpha_zh, beta_zs, kernel, \ residency_priors, num_iter, num_batches, mpi_mode, from_=0, to=np.inf): ''' Learns the latent topics from a temporal hypergraph trace. Here we do a asynchronous learning of the topics similar to AD-LDA, as well as the dynamic topic expansion/pruing. Parameters ---------- trace_fpath : str The path of the trace. Each line should be a \ (timestamp, hypernode, source, destination) where the \ timestamp is a long (seconds or milliseconds from epoch). num_topics : int The number of latent spaces to learn alpha_zh : float The value of the alpha_zh hyperparameter beta_zs : float The value of the beta_zs (beta) hyperaparameter kernel : Kernel object The kernel to use residency_priors : array of float The kernel hyper parameters num_iter : int The number of iterations to learn the model from num_batches : int Defines the number of batches of size num_iter Returns ------- A dictionary with the results. ''' assert num_batches >= 2 comm = MPI.COMM_WORLD num_workers = comm.size - 1 Dts, Trace, previous_stamps, Count_zh, Count_sz, \ count_h, count_z, prob_topics_aux, Theta_zh, Psi_sz, \ hyper2id, source2id = \ dataio.initialize_trace(trace_fpath, num_topics, num_iter, \ from_, to) if mpi_mode: workloads = generate_workload(Count_zh.shape[1], num_workers, Trace) all_idx = np.arange(Trace.shape[0], dtype='i4') for batch in xrange(num_batches): print('Now at batch', batch) if mpi_mode: for worker_id in xrange(1, num_workers + 1): comm.send(num_iter, dest=worker_id, tag=Msg.LEARN.value) dispatch_jobs(Dts, Trace, Count_zh, Count_sz, \ count_h, count_z, alpha_zh, beta_zs, kernel, \ residency_priors, workloads, num_workers, comm) manage(comm, num_workers) fetch_results(comm, num_workers, workloads, Dts, Trace, \ previous_stamps, Count_zh, Count_sz, count_h, \ count_z, alpha_zh, beta_zs, Theta_zh, Psi_sz, \ kernel) else: prob_topics_aux = np.zeros(Count_zh.shape[0], dtype='f8') _learn.em(Dts, Trace, previous_stamps, Count_zh, Count_sz, \ count_h, count_z, alpha_zh, beta_zs, \ prob_topics_aux, Theta_zh, Psi_sz, num_iter, \ num_iter * 2, kernel, False) print('Split') ll_per_z = np.zeros(count_z.shape[0], dtype='f8') _eval.quality_estimate(Dts, Trace, previous_stamps, \ Count_zh, Count_sz, count_h, count_z, alpha_zh, \ beta_zs, ll_per_z, all_idx, kernel) Trace, Count_zh, Count_sz, count_z, previous_stamps, \ P = split(Dts, Trace, previous_stamps, Count_zh, \ Count_sz, count_h, count_z, alpha_zh, beta_zs, \ ll_per_z, kernel) kernel = kernel.__class__() kernel.build(Trace.shape[0], Count_zh.shape[0], residency_priors) if residency_priors.shape[0] > 0: kernel.update_state(P) print('Merge') ll_per_z = np.zeros(count_z.shape[0], dtype='f8') _eval.quality_estimate(Dts, Trace, previous_stamps, \ Count_zh, Count_sz, count_h, count_z, alpha_zh, \ beta_zs, ll_per_z, all_idx, kernel) Trace, Count_zh, Count_sz, count_z, previous_stamps, \ P = merge(Dts, Trace, previous_stamps, Count_zh, \ Count_sz, count_h, count_z, alpha_zh, beta_zs, \ ll_per_z, kernel) kernel = kernel.__class__() kernel.build(Trace.shape[0], Count_zh.shape[0], residency_priors) if residency_priors.shape[0] > 0: kernel.update_state(P) Theta_zh = np.zeros(shape=Count_zh.shape, dtype='f8') Psi_sz = np.zeros(shape=Count_sz.shape, dtype='f8') if batch == num_batches - 1: print('Computing probs') _learn._aggregate(Count_zh, Count_sz, count_h, count_z, \ alpha_zh, beta_zs, Theta_zh, Psi_sz) print('New nz', Count_zh.shape[0]) if mpi_mode: for worker_id in xrange(1, num_workers + 1): comm.send(num_iter, dest=worker_id, tag=Msg.STOP.value) rv = prepare_results(trace_fpath, num_topics, alpha_zh, beta_zs, \ kernel, residency_priors, num_iter, -1, Dts, Trace, \ Count_zh, Count_sz, count_h, count_z, prob_topics_aux, Theta_zh, \ Psi_sz, hyper2id, source2id, from_, to) rv['num_workers'] = np.asarray([num_workers]) rv['num_batches'] = np.asarray([num_batches]) rv['algorithm'] = np.asarray(['parallel dynamic']) return rv
def split(Dts, Trace, previous_stamps, Count_zh, Count_sz, \ count_h, count_z, alpha_zh, beta_zs, ll_per_z, kernel, \ perc=0.05, min_stamps=50): nz = Count_zh.shape[0] nh = Count_zh.shape[1] ns = Count_sz.shape[0] assert nz == ll_per_z.shape[0] idx_int_all = np.arange(Trace.shape[0], dtype='i4') #Initiate auxiliary matrices Count_zh_spl = np.zeros(shape=(nz + 1, nh), dtype='i4') Count_sz_spl = np.zeros(shape=(ns, nz + 1), dtype='i4') count_z_spl = np.zeros(nz + 1, dtype='i4') Count_zh_spl[:-1, :] = Count_zh Count_sz_spl[:, :-1] = Count_sz count_z_spl[:-1] = count_z ll_per_z_new = np.zeros(nz + 1, dtype='f8') ll_per_z_new[:-1] = ll_per_z new_stamps = StampLists(nz + 1) for z in xrange(nz): new_stamps._extend(z, previous_stamps._get_all(z)) splitted = Trace[:, -1].copy() shift = 0 #Do the splits per topic for z in xrange(nz): #Candidates for removal topic_stamps = np.asanyarray(previous_stamps._get_all(z)) idx = Trace[:, -1] == z assert topic_stamps.shape[0] == idx.sum() argsrt = topic_stamps.argsort() top = int(np.ceil(perc * topic_stamps.shape[0])) #If not at least min stamps, exit, not enough for a CCDF estimation if top < min_stamps: continue #Populate stamps new_stamps._clear_one(z) new_stamps._clear_one(nz) new_stamps._extend(z, topic_stamps[:-top]) new_stamps._extend(nz, topic_stamps[-top:]) #Split topic on the Trace. The trace has to be sorted by timestamp!! old_assign = Trace[:, -1][idx].copy() new_assign = Trace[:, -1][idx].copy() new_assign[-top:] = nz Trace[:, -1][idx] = new_assign #Update matrices. Can't really vectorize this :( for line in Trace[idx][-top:]: h = line[0] Count_zh_spl[z, h] -= 1 for o in line[1:-1]: Count_sz_spl[o, z] -= 1 count_z_spl[z] -= 1 Count_zh_spl[nz, h] += 1 for o in line[1:-1]: Count_sz_spl[o, nz] += 1 count_z_spl[nz] += 1 #New LL ll_per_z_new[z] = 0 ll_per_z_new[-1] = 0 idx_int = idx_int_all[idx] _eval.quality_estimate(Dts, Trace, \ new_stamps, Count_zh_spl, Count_sz_spl, count_h, \ count_z_spl, alpha_zh, beta_zs, \ ll_per_z_new, idx_int, kernel) if ll_per_z_new.sum() > ll_per_z.sum(): new_assign[-top:] = nz + shift splitted[idx] = new_assign shift += 1 #Revert trace new_stamps._clear_one(z) new_stamps._clear_one(nz) new_stamps._extend(z, previous_stamps._get_all(z)) Count_zh_spl[:-1, :] = Count_zh Count_sz_spl[:, :-1] = Count_sz count_z_spl[:-1] = count_z Count_zh_spl[-1, :] = 0 Count_sz_spl[:, -1] = 0 count_z_spl[-1] = 0 ll_per_z_new[z] = ll_per_z[z] ll_per_z_new[-1] = 0 Trace[:, -1][idx] = old_assign return finalize_splits(nz, shift, splitted, Dts, Trace, nh, ns, kernel)
def merge(Dts, Trace, previous_stamps, Count_zh, Count_sz, \ count_h, count_z, alpha_zh, beta_zs, ll_per_z, kernel): nz = Count_zh.shape[0] nh = Count_zh.shape[1] ns = Count_sz.shape[0] idx_int_all = np.arange(Trace.shape[0], dtype='i4') #Get the nz most similar C = correlate_counts(Count_zh, Count_sz, count_h, count_z, \ alpha_zh, beta_zs) #k = int(np.ceil(np.sqrt(nz))) idx_dim1, idx_dim2 = \ np.unravel_index(C.flatten().argsort()[-nz:][::-1], C.shape) top_sims = zip(idx_dim1, idx_dim2) #New info new_stamps = previous_stamps.copy() Count_zh_mrg = Count_zh.copy() Count_sz_mrg = Count_sz.copy() count_z_mrg = count_z.copy() #Test merges merged = set() accepted = set() for z1, z2 in top_sims: if z1 in merged or z2 in merged: continue if C[z1, z2] <= 0: #already at nonsimilar break Count_zh_mrg[:] = Count_zh Count_sz_mrg[:] = Count_sz count_z_mrg[:] = count_z #Merge z1 and z2 Count_zh_mrg[z1] += Count_zh[z2] Count_sz_mrg[:, z1] += Count_sz[:, z2] count_z_mrg[z1] += count_z[z2] #Remove z2 Count_zh_mrg[z2] = 0 Count_sz_mrg[:, z2] = 0 count_z_mrg[z2] = 0 idx = Trace[:, -1] == z2 Trace[:, -1][idx] = z1 #get stamps for llhood idx_int = idx_int_all[idx] new_stamps._extend(z1, previous_stamps._get_all(z2)) new_stamps._clear_one(z2) #New likelihood ll_per_z_new = ll_per_z.copy() ll_per_z_new[z2] = 0 _eval.quality_estimate(Dts, Trace, \ new_stamps, Count_zh_mrg, Count_sz_mrg, count_h, \ count_z_mrg, alpha_zh, beta_zs, \ ll_per_z_new, idx_int, kernel) if ll_per_z_new.sum() > ll_per_z.sum(): merged.add(z1) merged.add(z2) accepted.add((z1, z2)) #Revert trace Trace[:, -1][idx] = z2 new_stamps._clear_one(z1) new_stamps._clear_one(z2) new_stamps._extend(z1, previous_stamps._get_all(z1)) new_stamps._extend(z2, previous_stamps._get_all(z2)) return finalize_merge(nz, accepted, Dts, Trace, nh, ns, kernel)