def finalize_splits(nz, n_splits, splitted, Dts, Trace, nh, ns, kernel):
    new_nz = nz + n_splits
    if kernel.get_priors().shape[0] > 0:
        new_P = [row for row in kernel.get_state()]
        for _ in xrange(n_splits):
            new_P.append(kernel.get_priors())
    else:
        new_P = kernel.get_state()

    Trace[:, -1] = splitted

    #Populate new counts
    Count_zh_new = np.zeros(shape=(new_nz, nh), dtype='i4')
    Count_sz_new = np.zeros(shape=(ns, new_nz), dtype='i4')
    count_z_new = np.zeros(new_nz, dtype='i4')
    count_h_new = np.zeros(nh, dtype='i4')

    _learn.fast_populate(Trace, Count_zh_new, Count_sz_new, \
            count_h_new, count_z_new)

    new_stamps = StampLists(new_nz)
    for z in xrange(new_nz):
        idx = Trace[:, -1] == z
        topic_stamps = Dts[idx]
        new_stamps._extend(z, topic_stamps[:, -1])

    return Trace, Count_zh_new, Count_sz_new, \
            count_z_new, new_stamps, np.array(new_P)
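#For reference, a minimal pure-Python sketch of the contract the compiled
#routine _learn.fast_populate is expected to fulfil, inferred from how the
#counts are built in initialize_trace and updated in split elsewhere in this
#code. The name fast_populate_py is hypothetical and illustrative only; the
#real implementation is the Cython extension.
def fast_populate_py(Trace, Count_zh, Count_sz, count_h, count_z):
    for line in Trace:
        h = line[0]          #hypernode (user) id
        z = line[-1]         #latent environment / topic id
        Count_zh[z, h] += 1
        count_h[h] += 1
        for o in line[1:-1]: #every object (source and destination) in the line
            Count_sz[o, z] += 1
            count_z[z] += 1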
def test_all():
    slists = StampLists(4)
    E = []
    for i in xrange(4):
        E.append([])
        for _ in xrange(200):
            e = np.random.rand()
            slists._append(i, e)
            E[i].append(e)

    kern = ECCDFKernel(True)
    kern.build(800, 4, np.array([1.0, 3.0]))
    kern._mstep(slists)

    for i in xrange(4):
        assert_array_equal(sorted(E[i]), slists._get_all(i))

    for i in xrange(4):
        for _ in xrange(200):
            #_pdf takes the inter-event time first, then the topic id
            p = kern._pdf(np.random.rand(), i, slists)
            assert p <= 1
            assert p >= 0
def test_pdf():
    for mu in [0, 1, 10]:
        for v in [1, 10]:
            for std in [1, 10]:
                priors = np.array([mu, v, std], dtype='d')
                kernel = TStudentKernel()
                kernel.build(999, 1, priors) #999 is just a placeholder trace size
                truth = t(v, loc=mu, scale=std)
                for x in np.linspace(-100, 100, 200):
                    print(mu, v, std, x, truth.pdf(x), \
                            kernel._pdf(x, 0, StampLists(1)))
                    assert_almost_equal(truth.pdf(x), \
                            kernel._pdf(x, 0, StampLists(1)))
def main(model, out_fpath_rrs, out_fpath_pred):
    store = pd.HDFStore(model)
    trace_fpath = store['trace_fpath'][0][0]
    tsFile = open('/home/zahran/Desktop/tribeFlow/zahranData/lastfm-dataset-1K/PARSED_74123_B10_zahran_sampledData', 'r')
    sequenceLength = 10

    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0

    kernel_class = store['kernel_class'][0][0]
    kernel_class = eval(kernel_class)

    Theta_zh = store['Theta_zh'].values
    Psi_sz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]
    P = store['P'].values
    residency_priors = store['residency_priors'].values[:, 0]

    previous_stamps = StampLists(count_z.shape[0]) #one stamp list per latent env (length nz)

    true_mem_size = store['Dts'].values.shape[1]
    #tstamps has shape (#training lines,) and holds the inter-event times t(x_i) - t(x_{i-1})
    tstamps = store['Dts'].values[:, 0]
    #assign has shape (#training lines,) and holds the latent env id of each training instance
    assign = store['assign'].values[:, 0]

    for z in xrange(count_z.shape[0]):
        idx = assign == z
        previous_stamps._extend(z, tstamps[idx]) #the stamps whose corresponding entry in idx is True

    hyper2id = dict(store['hyper2id'].values)
    obj2id = dict(store['source2id'].values)

    trace_size = sum(count_z) #total number of appearances over all envs
    kernel = kernel_class()
    kernel.build(trace_size, count_z.shape[0], residency_priors)
    kernel.update_state(P)

    with open(trace_fpath) as traceFile:
        predictObject(store, sequenceLength, tsFile, traceFile, true_mem_size,
                      hyper2id, obj2id, previous_stamps, Theta_zh, Psi_sz,
                      count_z, kernel)

    tsFile.close()
def finalize_merge(nz, to_merge, Dts, Trace, nh, ns, kernel):
    for z1, z2 in to_merge:
        idx = Trace[:, -1] == z2
        Trace[:, -1][idx] = z1

    if to_merge and kernel.get_priors().shape[0] > 0:
        new_P_dict = dict((i, row) for i, row in enumerate(kernel.get_state()))
        for z1, z2 in to_merge:
            del new_P_dict[z2]

        new_P = []
        for i in sorted(new_P_dict):
            new_P.append(new_P_dict[i])
    else:
        new_P = kernel.get_state()

    #Make sure new trace has contiguous ids
    new_assign = Trace[:, -1].copy()
    old_assign = Trace[:, -1].copy()
    if to_merge:
        new_nz = len(set(new_assign))
        for i, z in enumerate(set(new_assign)):
            idx = old_assign == z
            new_assign[idx] = i
    else:
        new_nz = nz
    Trace[:, -1] = new_assign

    #Populate new counts
    Count_zh_new = np.zeros(shape=(new_nz, nh), dtype='i4')
    Count_sz_new = np.zeros(shape=(ns, new_nz), dtype='i4')
    count_z_new = np.zeros(new_nz, dtype='i4')
    count_h_new = np.zeros(nh, dtype='i4')

    _learn.fast_populate(Trace, Count_zh_new, Count_sz_new, \
            count_h_new, count_z_new)

    new_stamps = StampLists(new_nz)
    for z in xrange(new_nz):
        idx = Trace[:, -1] == z
        topic_stamps = Dts[idx]
        new_stamps._extend(z, topic_stamps[:, -1])

    return Trace, Count_zh_new, Count_sz_new, \
            count_z_new, new_stamps, np.array(new_P)
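#Toy illustration of the contiguous-id relabeling performed above. This helper
#is hypothetical and only demonstrates the remapping step with made-up ids:
#after merging, the surviving ids (here {0, 1, 3}) are remapped onto
#0..new_nz-1, in whatever order the set yields them, mirroring the loop above.
def _merge_relabel_example():
    old_assign = np.array([0, 3, 1, 0, 3])
    new_assign = old_assign.copy()
    for i, z in enumerate(set(old_assign)):
        new_assign[old_assign == z] = i
    return new_assign #uses only the ids 0, 1 and 2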
def sample(Dts, Trace, Count_zh, Count_sz_local, \
        count_h, count_z, alpha_zh, beta_zs, kernel, num_iter, comm):

    previous_encounters_s = {}
    for other_processor in xrange(1, comm.size):
        previous_encounters_s[other_processor] = np.zeros_like(Count_sz_local)

    stamps = StampLists(Count_zh.shape[0])
    for z in xrange(Count_zh.shape[0]):
        idx = Trace[:, -1] == z
        #dts_assigned = Dts[idx:, 0].ravel().copy()
        #np.sort(dts_assigned)
        stamps._extend(z, Dts[idx][:, -1])

    aux = np.zeros(Count_zh.shape[0], dtype='f8')

    Count_sz_pair = np.zeros_like(Count_sz_local)
    Count_sz_others = np.zeros_like(Count_sz_local)
    Count_sz_sum = np.zeros_like(Count_sz_local)

    Theta_zh = np.zeros_like(Count_zh, dtype='f8')
    Psi_sz = np.zeros_like(Count_sz_local, dtype='f8')

    can_pair = True
    for i in xrange(num_iter // CACHE_SIZE):
        #Sample from the local counts and encountered counts
        Count_sz_sum[:] = Count_sz_local + Count_sz_others
        count_z[:] = Count_sz_sum.sum(axis=0)

        em(Dts, Trace, stamps, Count_zh, Count_sz_sum, \
                count_h, count_z, alpha_zh, beta_zs, aux, Theta_zh, \
                Psi_sz, CACHE_SIZE, CACHE_SIZE * 2, kernel, False)

        #Update local counts
        Count_sz_local[:] = Count_sz_sum - Count_sz_others
        count_z[:] = Count_sz_local.sum(axis=0)

        #Update expected belief of other processors
        if can_pair:
            P_local = kernel.get_state()
            can_pair = paired_update(comm, previous_encounters_s, \
                    Count_sz_local, Count_sz_pair, Count_sz_others, \
                    P_local, np.zeros_like(P_local))
            kernel.update_state(P_local)
def split(Dts, Trace, previous_stamps, Count_zh, Count_sz, \
        count_h, count_z, alpha_zh, beta_zs, ll_per_z, kernel, \
        perc=0.05, min_stamps=50):

    nz = Count_zh.shape[0]
    nh = Count_zh.shape[1]
    ns = Count_sz.shape[0]

    assert nz == ll_per_z.shape[0]

    idx_int_all = np.arange(Trace.shape[0], dtype='i4')

    #Initiate auxiliary matrices
    Count_zh_spl = np.zeros(shape=(nz + 1, nh), dtype='i4')
    Count_sz_spl = np.zeros(shape=(ns, nz + 1), dtype='i4')
    count_z_spl = np.zeros(nz + 1, dtype='i4')

    Count_zh_spl[:-1, :] = Count_zh
    Count_sz_spl[:, :-1] = Count_sz
    count_z_spl[:-1] = count_z

    ll_per_z_new = np.zeros(nz + 1, dtype='f8')
    ll_per_z_new[:-1] = ll_per_z

    new_stamps = StampLists(nz + 1)
    for z in xrange(nz):
        new_stamps._extend(z, previous_stamps._get_all(z))

    splitted = Trace[:, -1].copy()
    shift = 0

    #Do the splits per topic
    for z in xrange(nz):
        #Candidates for removal
        topic_stamps = np.asanyarray(previous_stamps._get_all(z))
        idx = Trace[:, -1] == z
        assert topic_stamps.shape[0] == idx.sum()

        argsrt = topic_stamps.argsort()
        top = int(np.ceil(perc * topic_stamps.shape[0]))

        #Not enough stamps for a CCDF estimation, skip this topic
        if top < min_stamps:
            continue

        #Populate stamps
        new_stamps._clear_one(z)
        new_stamps._clear_one(nz)
        new_stamps._extend(z, topic_stamps[:-top])
        new_stamps._extend(nz, topic_stamps[-top:])

        #Split topic on the Trace. The trace has to be sorted by timestamp!!
        old_assign = Trace[:, -1][idx].copy()
        new_assign = Trace[:, -1][idx].copy()
        new_assign[-top:] = nz
        Trace[:, -1][idx] = new_assign

        #Update matrices. Can't really vectorize this :(
        for line in Trace[idx][-top:]:
            h = line[0]
            Count_zh_spl[z, h] -= 1
            for o in line[1:-1]:
                Count_sz_spl[o, z] -= 1
                count_z_spl[z] -= 1

            Count_zh_spl[nz, h] += 1
            for o in line[1:-1]:
                Count_sz_spl[o, nz] += 1
                count_z_spl[nz] += 1

        #New LL
        ll_per_z_new[z] = 0
        ll_per_z_new[-1] = 0

        idx_int = idx_int_all[idx]
        _eval.quality_estimate(Dts, Trace, \
                new_stamps, Count_zh_spl, Count_sz_spl, count_h, \
                count_z_spl, alpha_zh, beta_zs, \
                ll_per_z_new, idx_int, kernel)

        if ll_per_z_new.sum() > ll_per_z.sum():
            new_assign[-top:] = nz + shift
            splitted[idx] = new_assign
            shift += 1

        #Revert trace
        new_stamps._clear_one(z)
        new_stamps._clear_one(nz)
        new_stamps._extend(z, previous_stamps._get_all(z))

        Count_zh_spl[:-1, :] = Count_zh
        Count_sz_spl[:, :-1] = Count_sz
        count_z_spl[:-1] = count_z
        Count_zh_spl[-1, :] = 0
        Count_sz_spl[:, -1] = 0
        count_z_spl[-1] = 0

        ll_per_z_new[z] = ll_per_z[z]
        ll_per_z_new[-1] = 0

        Trace[:, -1][idx] = old_assign

    return finalize_splits(nz, shift, splitted, Dts, Trace, nh, ns, kernel)
def main(model, out_fpath_rrs, out_fpath_pred):
    store = pd.HDFStore(model)

    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0

    trace_fpath = store['trace_fpath'][0][0]
    kernel_class = store['kernel_class'][0][0]
    kernel_class = eval(kernel_class)

    Theta_zh = store['Theta_zh'].values
    Psi_sz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]
    P = store['P'].values
    residency_priors = store['residency_priors'].values[:, 0]

    previous_stamps = StampLists(count_z.shape[0])

    mem_size = store['Dts'].values.shape[1]
    tstamps = store['Dts'].values[:, 0]
    assign = store['assign'].values[:, 0]

    for z in xrange(count_z.shape[0]):
        idx = assign == z
        previous_stamps._extend(z, tstamps[idx])

    hyper2id = dict(store['hyper2id'].values)
    obj2id = dict(store['source2id'].values)

    HSDs = []
    Dts = []

    with open(trace_fpath) as trace_file:
        for i, l in enumerate(trace_file):
            if i < to:
                continue

            spl = l.strip().split('\t')
            dts_line = [float(x) for x in spl[:mem_size]]
            h = spl[mem_size]
            d = spl[-1]
            sources = spl[mem_size + 1:-1]

            all_in = h in hyper2id and d in obj2id
            for s in sources:
                all_in = all_in and s in obj2id

            if all_in:
                trace_line = [hyper2id[h]] + [obj2id[s] for s in sources] + \
                        [obj2id[d]]
                HSDs.append(trace_line)
                Dts.append(dts_line)

    trace_size = sum(count_z)
    kernel = kernel_class()
    kernel.build(trace_size, count_z.shape[0], residency_priors)
    kernel.update_state(P)

    num_queries = min(10000, len(HSDs))
    queries = np.random.choice(len(HSDs), size=num_queries)

    HSDs = np.array(HSDs, dtype='i4')[queries].copy()
    Dts = np.array(Dts, dtype='d')[queries].copy()

    rrs, preds = _eval.reciprocal_rank(Dts, \
            HSDs, previous_stamps, Theta_zh, Psi_sz, count_z, kernel, True)

    np.savetxt(out_fpath_rrs, rrs)
    np.savetxt(out_fpath_pred, preds)
    print((1.0 / rrs).mean(axis=0))

    store.close()
def initialize_trace(trace_fpath, num_topics, num_iter, \
        from_=0, to=np.inf, initial_assign=None):
    '''
    Given a trace (user trajectories) to learn from, this method will
    initialize the necessary matrices and dicts to learn tribeflow.

    Using from_ and to, the trace can be sliced. initial_assign is useful to
    pick up learning from a previous model.

    Parameters
    ----------
    trace_fpath : string
        The location of the trace

    num_topics : int
        The number of latent spaces

    from_ : int
        Where to begin reading the trace from. 0 is the first line.

    to : int
        We will stop reading the file here

    initial_assign : array-like
        Initial topic assignments.

    Returns
    -------
    Count matrices and dicts used to learn tribeflow.
    '''
    count_zh_dict = defaultdict(int)
    count_oz_dict = defaultdict(int)
    count_z_dict = defaultdict(int)
    count_h_dict = defaultdict(int)

    hyper2id = OrderedDict()
    obj2id = OrderedDict()

    if initial_assign is not None:
        initial_assign = np.asarray(initial_assign, dtype='i')
        assert initial_assign.min() >= 0
        assert initial_assign.max() < num_topics

    Dts = []
    Trace = []
    with open(trace_fpath, 'r') as trace_file:
        for i, line in enumerate(trace_file):
            if i < from_:
                continue
            if i >= to:
                break

            spl = line.strip().split('\t')
            assert len(spl) >= 4
            assert (len(spl) - 2) % 2 == 0
            mem_size = (len(spl) - 2) // 2

            line_dts = []
            for j in xrange(mem_size):
                line_dts.append(float(spl[j]))
            Dts.append(line_dts)

            hyper_str = spl[mem_size]
            if hyper_str not in hyper2id:
                hyper2id[hyper_str] = len(hyper2id)

            if initial_assign is None:
                z = np.random.randint(num_topics)
            else:
                z = initial_assign[i]

            h = hyper2id[hyper_str]
            count_zh_dict[z, h] += 1
            count_h_dict[h] += 1

            line_int = [h]
            for j in xrange(mem_size + 1, len(spl)):
                obj_str = spl[j]
                if obj_str not in obj2id:
                    obj2id[obj_str] = len(obj2id)

                o = obj2id[obj_str]
                line_int.append(o)
                count_oz_dict[o, z] += 1
                count_z_dict[z] += 1

            line_int.append(z)
            Trace.append(line_int)

    #Sort by the last residency time.
    Dts = np.asarray(Dts)
    argsort = Dts[:, -1].argsort()
    assert Dts.shape[1] == mem_size

    #Create contiguous arrays, not needed but adds a small speedup
    Dts = np.asanyarray(Dts[argsort], order='C')
    Trace = np.asarray(Trace)
    Trace = np.asanyarray(Trace[argsort], dtype='i4', order='C')

    nh = len(hyper2id)
    no = len(obj2id)
    nz = num_topics

    previous_stamps = StampLists(num_topics)
    for z in xrange(nz):
        idx = Trace[:, -1] == z
        topic_stamps = Dts[:, -1][idx]
        previous_stamps._extend(z, topic_stamps)

    Count_zh = np.zeros(shape=(nz, nh), dtype='i4')
    Count_oz = np.zeros(shape=(no, nz), dtype='i4')
    count_h = np.zeros(shape=(nh, ), dtype='i4')
    count_z = np.zeros(shape=(nz, ), dtype='i4')

    for z in xrange(Count_zh.shape[0]):
        count_z[z] = count_z_dict[z]

        for h in xrange(Count_zh.shape[1]):
            count_h[h] = count_h_dict[h]
            Count_zh[z, h] = count_zh_dict[z, h]

        for o in xrange(Count_oz.shape[0]):
            Count_oz[o, z] = count_oz_dict[o, z]

    assert (Count_oz.sum(axis=0) == count_z).all()

    prob_topics_aux = np.zeros(nz, dtype='f8')
    Theta_zh = np.zeros(shape=(nz, nh), dtype='f8')
    Psi_oz = np.zeros(shape=(no, nz), dtype='f8')

    return Dts, Trace, previous_stamps, Count_zh, Count_oz, \
            count_h, count_z, prob_topics_aux, Theta_zh, Psi_oz, \
            hyper2id, obj2id
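#A hedged usage sketch for initialize_trace. The file name is hypothetical;
#what matters is the tab-separated layout the parser above expects: mem_size
#inter-event times, the hyper (user) id, then mem_size + 1 object ids per
#line, e.g. with mem_size = 2:
#
#    1.5<TAB>2.0<TAB>alice<TAB>songA<TAB>songB<TAB>songC
#
def _initialize_trace_example(trace_fpath='example.trace'):
    return initialize_trace(trace_fpath, num_topics=10, num_iter=800)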
def main(model, out_fpath_rrs, out_fpath_pred):
    store = pd.HDFStore(model)

    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0

    trace_fpath = store['trace_fpath'][0][0]
    kernel_class = store['kernel_class'][0][0]
    kernel_class = eval(kernel_class)

    Theta_zh = store['Theta_zh'].values
    Psi_sz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]
    P = store['P'].values
    residency_priors = store['residency_priors'].values[:, 0]

    previous_stamps = StampLists(count_z.shape[0]) #one stamp list per latent env (length nz)

    mem_size = store['Dts'].values.shape[1]
    #tstamps has shape (#training lines,) and holds the inter-event times t(x_i) - t(x_{i-1})
    tstamps = store['Dts'].values[:, 0]
    #assign has shape (#training lines,) and holds the latent env id of each training instance
    assign = store['assign'].values[:, 0]

    for z in xrange(count_z.shape[0]):
        idx = assign == z
        previous_stamps._extend(z, tstamps[idx]) #the stamps whose corresponding entry in idx is True

    hyper2id = dict(store['hyper2id'].values)
    obj2id = dict(store['source2id'].values)

    HSDs = []
    Dts = []

    with open(trace_fpath) as trace_file:
        for i, l in enumerate(trace_file):
            if i < to: #skip the training lines to reach the test set
                continue

            spl = l.strip().split('\t')
            dts_line = [float(x) for x in spl[:mem_size]]
            h = spl[mem_size]
            d = spl[-1]
            sources = spl[mem_size + 1:-1]

            all_in = h in hyper2id and d in obj2id
            for s in sources:
                all_in = all_in and s in obj2id

            if all_in:
                trace_line = [hyper2id[h]] + [obj2id[s] for s in sources] + \
                        [obj2id[d]]
                HSDs.append(trace_line)
                Dts.append(dts_line)

    trace_size = sum(count_z) #total number of appearances over all envs
    kernel = kernel_class()
    kernel.build(trace_size, count_z.shape[0], residency_priors)
    kernel.update_state(P)

    num_queries = min(10000, len(HSDs))
    queries = np.random.choice(len(HSDs), size=num_queries)

    HSDs = np.array(HSDs, dtype='i4')[queries].copy()
    Dts = np.array(Dts, dtype='d')[queries].copy()

    rrs, preds = _eval.reciprocal_rank(Dts, HSDs, previous_stamps, Theta_zh, \
            Psi_sz, count_z, kernel, True)

    np.savetxt(out_fpath_rrs, rrs)
    np.savetxt(out_fpath_pred, preds)
    print(rrs.mean(axis=0))

    store.close()