def recombination(self, s1, s2):
    """
    Recombine genes of two solutions. If s2 is None, simply return s1.

    Args:
    ----------
    s1 (numpy.array) : 1d array representing a solution.
    s2 (numpy.array) : 1d array representing a solution.

    Returns:
    ----------
    tuple of np.array: recombined solutions.
    """
    if s2 is None:
        return s1

    # create children
    c1, c2 = np.full_like(s1, fill_value=-1), np.full_like(s1, fill_value=-1)
    # positions where both are valid
    r = np.where(np.logical_and(s1 != -1, s2 != -1))[0]
    # positions where exactly one is valid
    q = np.where(np.logical_xor(s1 != -1, s2 != -1))[0]
    q_sol = s1[q] + s2[q] + 1  # values at positions q different from -1

    # find best solution among r
    r_size = r.shape[0]
    best_coef, best_sol = np.inf, None
    if r_size:
        for sol in cartesian(np.vstack([s1[r], s2[r]]).T):
            c1[r] = sol
            coef = self.sparcity_coefficient(c1, r_size)
            if coef < best_coef:
                best_coef, best_sol = coef, sol
        c1[r] = best_sol

    # greedily find best solution among q
    dimensions_inserted = r_size
    taken_q = np.zeros_like(q)
    while dimensions_inserted != self.dimensionality:
        dimensions_inserted += 1
        best_coef, best_index, current_best_index = np.inf, None, None
        for j, (taken, q_index) in enumerate(zip(taken_q, q)):
            if not taken:
                c1[q_index] = q_sol[j]
                coef = self.sparcity_coefficient(c1, dimensions_inserted)
                if coef < best_coef:
                    best_coef, best_index, current_best_index = coef, q_index, j
                c1[q_index] = -1
        c1[best_index] = q_sol[current_best_index]
        taken_q[current_best_index] = 1

    # make c2 complementary to c1
    crossover_positions = np.concatenate([r, q])
    c2[crossover_positions] = (s1[crossover_positions] + s2[crossover_positions]
                               - c1[crossover_positions])
    return c1, c2
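# Minimal harness for recombination() above; "ToyGA" and its
# sparcity_coefficient() are hypothetical stand-ins for the real class, and
# cartesian() here is a plain product helper matching how it is called above.
import itertools
import numpy as np

def cartesian(arrays):
    # one combination per row, drawing one element from each row of `arrays`
    return np.array(list(itertools.product(*arrays)))

class ToyGA(object):
    def __init__(self, dimensionality):
        self.dimensionality = dimensionality

    def sparcity_coefficient(self, solution, n_used):
        # stand-in fitness: spread of the used values (lower is better)
        used = solution[solution != -1]
        return float(np.ptp(used)) / max(n_used, 1)

ToyGA.recombination = recombination   # reuse the method defined above

ga = ToyGA(dimensionality=4)
s1 = np.array([0, 2, -1, 5, -1])      # -1 marks an unused position
s2 = np.array([1, -1, 3, 4, -1])
c1, c2 = ga.recombination(s1, s2)
print(c1, c2)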
def roller(thetas, rhos, radius):
    # Clipper works only with integers, scaling needed
    p = cartesian(thetas, rhos)
    scale = 1 / (rhos.max() * (1 - np.cos(np.pi / thetas.shape[0])))  # very good
    p *= scale
    coords = p.astype(int)

    pco = pyclipper.PyclipperOffset()
    pco.AddPath(coords, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    result = pco.Execute(-radius * scale)[0]

    p = polar(*zip(*result))
    p[1] /= scale
    return p
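# Hypothetical call of roller() above; pyclipper must be installed
# (`pip install pyclipper`), and cartesian()/polar() stand-ins are provided so
# the snippet runs on its own (the project's own helpers may differ).
import numpy as np
import pyclipper

def cartesian(theta, rho):
    return np.column_stack([rho * np.cos(theta), rho * np.sin(theta)])

def polar(x, y):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    return np.array([np.arctan2(y, x), np.hypot(x, y)])

thetas = np.linspace(0, 2 * np.pi, 360, endpoint=False)
rhos = 40 + 5 * np.sin(3 * thetas)                          # a wavy closed profile
inner_theta, inner_rho = roller(thetas, rhos, radius=2.0)   # offset inwards by 2 units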
def __getattr__(self, name):
    if name == 'theta':
        return self.pcoords[0]
    elif name == 'rho':
        return self.pcoords[1]
    elif name == 'ppoints':
        return self.pcoords.T
    elif name == 'points':
        # convert to Cartesian lazily and cache the result
        if self._points is None:
            self._points = cartesian(*self.pcoords)
        return self._points
    elif name == 'coords':
        return self.points.T
    return None
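# Minimal sketch of the owning class, assuming pcoords is a (2, N) array of
# polar coordinates (theta row, rho row) and cartesian() is the project's
# polar->Cartesian helper; the stand-in below keeps the example self-contained.
import numpy as np

def cartesian(theta, rho):
    return np.column_stack([rho * np.cos(theta), rho * np.sin(theta)])

class Profile(object):
    def __init__(self, pcoords):
        self.pcoords = np.asarray(pcoords, dtype=float)
        self._points = None           # filled on first access to .points

Profile.__getattr__ = __getattr__     # reuse the accessor defined above

prof = Profile([[0.0, np.pi / 2], [1.0, 1.0]])
print(prof.theta, prof.rho)           # rows of pcoords
print(prof.points)                    # converted once, then cached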
def init_neg_samples():
    with open(TEST_NEG_SAMPLES, 'rb') as fp:
        data = json.loads(fp.read())

    new_session_num = 0
    new_sample_num = 0

    # need to normalize trip to start at 0,0
    x0 = data['features'][0]['geometry']['coordinates'][1]
    y0 = data['features'][0]['geometry']['coordinates'][0]
    (x0, y0) = cartesian(x0, y0)[:2]

    for feature in data['features']:
        (x, y) = cartesian(feature['geometry']['coordinates'][1],
                           feature['geometry']['coordinates'][0])[:2]
        new_log = Logs(driverID=2,
                       sessionNum=new_session_num,
                       sampleNum=new_sample_num,
                       time=feature['properties']['time'],
                       timeLong=feature['properties']['time_long'],
                       xCoord=x - x0,
                       yCoord=y - y0)
        db.session.add(new_log)
        new_sample_num += 1
    db.session.commit()
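# In init_neg_samples() above, cartesian() converts geographic (lat, lon) into
# planar coordinates and the trip is shifted so its first sample sits at (0, 0).
# The real helper is not shown (and likely returns x, y, z, hence the [:2]);
# a crude equirectangular stand-in illustrates the normalisation step:
import numpy as np

EARTH_R = 6371000.0  # metres

def cartesian(lat, lon):
    # assumption, not the project's helper
    return EARTH_R * np.radians(lon), EARTH_R * np.radians(lat)

x0, y0 = cartesian(12.9716, 77.5946)[:2]
x, y = cartesian(12.9720, 77.5950)[:2]
print(x - x0, y - y0)   # second sample expressed relative to the first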
def stl(cam, filename, width, conj=False):
    if conj:
        points = cartesian(*cam.conj_pcoords)
    else:
        points = cam.points

    # 2D Delaunay triangulation for front face and building of lower and upper faces
    tri0 = points[Delaunay(points).simplices]
    tri1 = np.concatenate((tri0, np.zeros([tri0.shape[0], tri0.shape[1], 1])), 2)
    tri2 = np.concatenate((tri0, np.ones([tri0.shape[0], tri0.shape[1], 1]) * width), 2)

    # Build triangles for side face
    vertices1 = np.concatenate((points, np.zeros([points.shape[0], 1])), 1)
    vertices2 = np.concatenate((points, np.ones([points.shape[0], 1]) * width), 1)
    tri3_1 = np.empty([points.shape[0], 3, 3])
    tri3_2 = np.empty_like(tri3_1)
    for i in range(0, vertices1.shape[0]):
        tri3_1[i] = [vertices1[i - 1], vertices1[i], vertices2[i]]
        tri3_2[i] = [vertices2[i - 1], vertices2[i], vertices1[i - 1]]

    # Concatenate and save
    tri = np.concatenate((tri1, tri2, tri3_1, tri3_2))
    data = np.zeros(tri.shape[0], dtype=mesh.Mesh.dtype)
    data['vectors'] = tri
    prism = mesh.Mesh(data)
    prism.save(filename)

    # Ugly plotting
    fig = plt.figure()
    ax = mplot3d.Axes3D(fig)
    ax.add_collection3d(mplot3d.art3d.Poly3DCollection(prism.vectors))

    # Auto scale
    scale = prism.points.flatten()  # flatten(-1) is rejected by recent NumPy
    ax.auto_scale_xyz(scale, scale, scale)
    plt.show()
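# Hypothetical use of stl(): extrude a simple circular outline into a prism.
# Only a `.points` attribute holding an (N, 2) Cartesian outline is needed
# here; the imports below mirror the module-level names stl() is assumed to use.
import numpy as np
from types import SimpleNamespace
from scipy.spatial import Delaunay
from stl import mesh                     # numpy-stl
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d

theta = np.linspace(0, 2 * np.pi, 72, endpoint=False)
outline = np.column_stack([30 * np.cos(theta), 30 * np.sin(theta)])
fake_cam = SimpleNamespace(points=outline)

stl(fake_cam, 'disc.stl', width=8.0)     # writes disc.stl and shows a plot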
def get_experiments(cfg):
    """Creates a list of experiments."""
    # TODO: sampling
    exps = utils.cartesian(cfg.optspace)

    if hasattr(cfg, 'constraints'):
        for c in cfg.constraints:
            exps_tmp = []
            for exp in exps:
                if c(exp):
                    exps_tmp.append(exp)
            exps = exps_tmp

    if hasattr(cfg, 'optpt_cmp'):
        if sys.version_info < (3, 2):
            exps.sort(cmp=cfg.optpt_cmp)
        else:
            # the `cmp` parameter is gone in Python 3;
            # functools.cmp_to_key is available since Python 2.7 / 3.2
            import functools
            exps.sort(key=functools.cmp_to_key(cfg.optpt_cmp))
    return exps
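# A sketch of a config object get_experiments() might consume. The real shape
# of cfg.optspace depends on the project's utils.cartesian; the stand-in below
# expands a dict of option lists into a list of dicts.
import sys        # used inside get_experiments()
import itertools
from types import SimpleNamespace

utils = SimpleNamespace(cartesian=lambda optspace: [
    dict(zip(sorted(optspace), vals))
    for vals in itertools.product(*(optspace[k] for k in sorted(optspace)))
])

cfg = SimpleNamespace(
    optspace={'threads': [1, 2, 4], 'block': [32, 64]},
    constraints=[lambda e: e['threads'] * e['block'] <= 128],   # drop oversized runs
    optpt_cmp=lambda a, b: a['threads'] - b['threads'],         # old-style comparator
)
print(get_experiments(cfg))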
def rot_coords(self, theta0):
    return cartesian(self.theta + theta0, self.rho).T
def fit(self, W, C, vocab, AD, author_list, timestamp_list, verbose=True):
    # {{{ run gibbs sampling
    import sys
    import numpy as np
    from utils import cartesian
    from scipy.stats import poisson
    '''
    W[:,0] -> word index
    W[:,1] -> document index
    W[:,2] -> timestamp index
    C: document-citation sparse matrix
        C[i, t] -> citation count for the ith document at timestamp t
    AD: author-document sparse matrix
    '''
    # number of authors
    A = author_list.size
    # number of unique tokens
    V = vocab.size
    # total number of tokens
    nnz = W.shape[0]
    # number of timestamps
    T = timestamp_list.size

    # save meta-info to this object
    self.vocabulary = vocab
    self.authors = author_list
    self.timestamps = timestamp_list

    # init to one above the max val
    z_states = np.zeros((self.n_iter+1, nnz), dtype=np.uint32) + self.K
    a_states = np.zeros((self.n_iter+1, nnz), dtype=np.uint32) + A
    t_states = np.zeros((self.n_iter+1, nnz), dtype=np.uint32) + T

    # create all needed sequences
    k_range = np.arange(self.K)
    t_range = np.arange(T)
    a_range = np.arange(A)
    v_range = np.arange(V)

    # 1.1 initialize topic assignment randomly
    z_states[0, :] = np.random.choice(self.K, nnz, True)

    # 1.2 initialize author and timestamp assignment
    for i in np.arange(nnz):
        t = W[i, 2]
        di = W[i, 1]
        ad = AD[:, di].nonzero()[0]
        a_states[0, i] = np.random.choice(ad)
        t_states[0, i] = np.random.choice(np.arange(t, T))

    # 2. initialize lambda matrix
    lambda_ = np.zeros((self.K, T), dtype=np.uint32)
    for k in k_range:
        k_indices = np.where(z_states[0, :] == k)[0]
        for t in np.arange(T):
            t_indices = np.where(t_states[0, :] == t)[0]
            kt_indices = np.intersect1d(t_indices, k_indices)
            d_indices = W[kt_indices, 1]
            lambda_[k, t] = C[d_indices, t].mean()
    # zeros set to overall mean
    lam_x, lam_y = np.where(lambda_ == 0)
    if lam_x.size:
        lambda_[lam_x, lam_y] = C.mean()/float(lam_x.size)

    # 3. sample
    # {{{
    for iter_ in np.arange(1, self.n_iter+1):
        if verbose:
            print 'Iter %i...... (Total %i)' % (iter_, self.n_iter)
            sys.stdout.flush()
        else:
            if iter_ % 100 == 0:
                # report progress every 100 iterations when not verbose
                print 'Iter %i...... (Total %i)' % (iter_, self.n_iter)
                sys.stdout.flush()

        for i in np.arange(nnz):
            # {{{ sample each token sequentially

            # {{{ denominators
            den_author = np.zeros(A, dtype=np.float_)
            for a in a_range:
                for k in k_range:
                    # words that are assigned to topic k, excluding the current one
                    k_indices = np.append(
                        np.where(z_states[iter_-1, i+1:] == k)[0] + (i+1),
                        np.where(z_states[iter_, :i] == k)[0])
                    n_a_k_i = (a_states[iter_-1, k_indices[k_indices > i]] == a).sum() + \
                              (a_states[iter_, k_indices[k_indices < i]] == a).sum()
                    den_author[a] += n_a_k_i
                den_author[a] += self.K*self.alpha

            den_timestamp = np.zeros(self.K, dtype=np.float_)
            den_token = np.zeros(self.K, dtype=np.float_)
            for k in k_range:
                for t in t_range:
                    # words that are assigned to timestamp t, excluding the current one
                    t_indices = np.append(
                        np.where(t_states[iter_-1, i+1:] == t)[0] + (i+1),
                        np.where(t_states[iter_, :i] == t)[0])
                    n_k_t_i = (z_states[iter_-1, t_indices[t_indices > i]] == k).sum() + \
                              (z_states[iter_, t_indices[t_indices < i]] == k).sum()
                    den_timestamp[k] += n_k_t_i
                den_timestamp[k] += T*self.pi

                for v in v_range:
                    # words that are token v, excluding the current one
                    v_indices = np.append(
                        np.where(W[i+1:, 0] == v)[0] + (i+1),
                        np.where(W[:i, 0] == v)[0])
                    n_k_v_i = (z_states[iter_-1, v_indices[v_indices > i]] == k).sum() + \
                              (z_states[iter_, v_indices[v_indices < i]] == k).sum()
                    den_token[k] += n_k_v_i
                den_token[k] += V*self.beta
            # }}}

            v = W[i, 0]
            t = W[i, 2]
            di = W[i, 1]
            ci = C[di, :]
            # find its authors
            ad = AD[:, di].nonzero()[0]

            comb_list = cartesian((np.arange(t, T), k_range, ad))
            comb_p_list = np.zeros(comb_list.shape[0], dtype=np.float_)

            # excluding the current one
            v_indices = np.append(np.where(W[i+1:, 0] == v)[0] + (i+1),
                                  np.where(W[:i, 0] == v)[0])

            # {{{ for each combination, obtain full conditional probability
            for comb_index in np.arange(comb_p_list.size):
                comb = comb_list[comb_index]
                t, k, a = comb

                # 1
                t_indices = np.append(
                    np.where(t_states[iter_-1, i+1:] == t)[0] + (i+1),
                    np.where(t_states[iter_, :i] == t)[0])
                n_k_t_i = (z_states[iter_-1, t_indices[t_indices > i]] == k).sum() + \
                          (z_states[iter_, t_indices[t_indices < i]] == k).sum()
                p1 = (n_k_t_i + self.pi)/den_timestamp[k]

                # 2
                n_k_v_i = (z_states[iter_-1, v_indices[v_indices > i]] == k).sum() + \
                          (z_states[iter_, v_indices[v_indices < i]] == k).sum()
                p2 = (n_k_v_i + self.beta)/den_token[k]

                # 3
                # excluding the current one
                k_indices = np.append(
                    np.where(z_states[iter_-1, i+1:] == k)[0] + (i+1),
                    np.where(z_states[iter_, :i] == k)[0])
                n_a_k_i = (a_states[iter_-1, k_indices[k_indices > i]] == a).sum() + \
                          (a_states[iter_, k_indices[k_indices < i]] == a).sum()
                p3 = (n_a_k_i + self.alpha)/den_author[a]

                # poisson pmf
                p4 = poisson.pmf(ci[t], mu=lambda_[k, t])

                comb_p_list[comb_index] = p1*p2*p3*p4
            # }}}

            # rescale to [0,1]
            comb_p_list = comb_p_list/comb_p_list.sum()

            # sample for i-th word
            comb_index = np.random.choice(np.arange(comb_p_list.size), p=comb_p_list)
            t, k, a = comb_list[comb_index]
            t_states[iter_, i] = t
            z_states[iter_, i] = k
            a_states[iter_, i] = a
            # }}} END for i-th TOKEN

        # update lambda after each iteration
        for k in k_range:
            k_indices = np.where(z_states[iter_, :] == k)[0]
            for t in t_range:
                t_indices = np.where(t_states[iter_, :] == t)[0]
                kt_indices = np.intersect1d(k_indices, t_indices)
                d_indices = W[kt_indices, 1]
                # if no word is assigned to topic k and timestamp t, keep it as before
                if d_indices.size > 0:
                    lambda_[k, t] = C[d_indices, t].mean()
    # }}}

    # 4. obtain \theta, \phi, and \psi
    # burn-in: first half
    z_samples = z_states[1:, :][self.n_iter/2:, :]
    a_samples = a_states[1:, :][self.n_iter/2:, :]
    t_samples = t_states[1:, :][self.n_iter/2:, :]

    # author-topic
    theta = np.zeros((A, self.K), dtype=np.float_)
    # topic-word
    phi = np.zeros((self.K, V), dtype=np.float_)
    # topic-timestamp
    psi = np.zeros((self.K, T), dtype=np.float_)

    for a in a_range:
        den = self.K * self.alpha + (a_samples == a).sum()
        a_x, a_y = np.where(a_samples == a)
        for k in k_range:
            n_a_k = (z_samples[a_x, a_y] == k).sum()
            theta[a, k] = float(n_a_k + self.alpha)/den

    for k in k_range:
        k_count = (z_samples == k).sum()
        den_v = V * self.beta + k_count
        den_t = T * self.pi + k_count
        # x is iteration number, y is word index
        k_x, k_y = np.where(z_samples == k)
        for v in v_range:
            n_k_v = (W[k_y, 0] == v).sum()
            phi[k, v] = float(n_k_v + self.beta)/den_v
        for t in t_range:
            n_k_t = (t_samples[k_x, k_y] == t).sum()
            psi[k, t] = float(n_k_t + self.pi)/den_t

            # update lambda
            t_x, t_y = np.where(t_samples == t)
            kt_indices = np.intersect1d(k_y, t_y)
            d_indices = W[kt_indices, 1]
            # if no word is assigned to topic k and timestamp t, keep it as before
            if d_indices.size > 0:
                lambda_[k, t] = C[d_indices, t].mean()

    self.theta = theta
    self.phi = phi
    self.psi = psi
    self.lambda_ = lambda_
    self.z_samples = z_samples
    self.a_samples = a_samples
    self.t_samples = t_samples
    # }}}
    return theta, phi, psi, lambda_
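# Hypothetical driver for fit() above, kept in the same Python 2 style.
# "CitationTopicModel" is an illustrative name for the enclosing class, which
# is not shown in this excerpt; it only needs the hyperparameters used above
# (K, n_iter, alpha, beta, pi). Requires the project's utils.cartesian (a
# Cartesian-product helper) and scipy on the path.
import numpy as np


class CitationTopicModel(object):
    def __init__(self, K, n_iter, alpha, beta, pi):
        self.K, self.n_iter = K, n_iter
        self.alpha, self.beta, self.pi = alpha, beta, pi


CitationTopicModel.fit = fit  # attach the sampler defined above

V, D, T, A, nnz = 20, 10, 3, 4, 200                   # vocab, docs, timestamps, authors, tokens
W = np.column_stack([np.random.randint(0, V, nnz),    # word index
                     np.random.randint(0, D, nnz),    # document index
                     np.random.randint(0, T, nnz)])   # timestamp index
C = np.random.poisson(2.0, size=(D, T))               # citations per doc/timestamp
AD = (np.random.rand(A, D) < 0.3).astype(int)         # author-document incidence
AD[np.random.randint(0, A, D), np.arange(D)] = 1      # every doc gets >= 1 author

model = CitationTopicModel(K=2, n_iter=10, alpha=0.1, beta=0.01, pi=0.1)
theta, phi, psi, lambda_ = model.fit(W, C, np.arange(V), AD,
                                     np.arange(A), np.arange(T), verbose=False)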
def post_prediction(tma_id):
    tma_exists = db.session.query(TMAs.tmaID).filter_by(tmaID=tma_id).scalar()
    if tma_exists is None:
        return jsonify({
            "status": "fail",
            "data": {
                "tma_id": "%s is not registered" % tma_id
            }
        }), 400

    driver_id = tmaID_to_driverID(tma_id)
    if driver_id is None:
        return jsonify({
            "status": "fail",
            "data": {
                "tma_id": "%s does not correspond to a driver according to dispatch" % tma_id
            }
        }), 404

    if 'log' not in request.files:
        return jsonify({
            "status": "fail",
            "data": {
                "log": "no log file attached"
            }
        }), 400
    else:
        file = request.files['log']
        data = json.loads(file.read())

    try:
        new_session_num = db.session.query(func.max(
            Logs.sessionNum)).filter_by(driverID=driver_id).scalar() + 1
    except TypeError:
        new_session_num = 0

    new_sample_num = 0
    start_time = data['features'][0]['properties']['time']

    # need to normalize trip to start at 0,0
    x0 = data['features'][0]['geometry']['coordinates'][1]
    y0 = data['features'][0]['geometry']['coordinates'][0]
    (x0, y0) = utils.cartesian(x0, y0)[:2]

    for feature in data['features']:
        (x, y) = utils.cartesian(feature['geometry']['coordinates'][1],
                                 feature['geometry']['coordinates'][0])[:2]
        new_log = Logs(driverID=driver_id,
                       sessionNum=new_session_num,
                       sampleNum=new_sample_num,
                       time=feature['properties']['time'],
                       timeLong=feature['properties']['time_long'],
                       xCoord=x - x0,
                       yCoord=y - y0)
        db.session.add(new_log)
        new_sample_num += 1
    db.session.commit()

    # temp prediction
    p = multiprocessing.Process(target=make_prediction_async,
                                args=(driver_id, new_session_num, start_time,))
    #p = multiprocessing.Process(target=make_false_prediction_async, args=(driver_id, new_session_num, start_time,))
    p.start()

    return jsonify({"status": "success", "data": None}), 200
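# Hypothetical client call for the endpoint above. The actual route path is
# not shown in this excerpt, so '/predictions/<tma_id>' is only an assumption;
# the uploaded file must be a GeoJSON FeatureCollection with `time` and
# `time_long` properties, as parsed above.
import json
import requests

feature_collection = {
    "features": [
        {"geometry": {"coordinates": [77.5946, 12.9716]},    # [lon, lat]
         "properties": {"time": "2019-01-01T08:00:00", "time_long": 1546329600}},
        {"geometry": {"coordinates": [77.5950, 12.9720]},
         "properties": {"time": "2019-01-01T08:00:05", "time_long": 1546329605}},
    ]
}
resp = requests.post(
    "http://localhost:5000/predictions/TMA123",
    files={"log": ("trip.geojson", json.dumps(feature_collection))},
)
print(resp.status_code, resp.json())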
# for j in range(0, x.shape[1]):
#     if a[i, j] == 1:
#         for t1 in range(0, 2):
#             p = utils.conditional_probability(
#                 x, j, np.equal, 1, i, np.equal, t1)
#             print 'P({:s}=1|{:s}={:d})={:3.2f}%'.format(att[j], att[i], t1, 100.0 * p)
#             print 'P({:s}=0|{:s}={:d})={:3.2f}%'.format(att[j], att[i], t1, 100.0 * (1 - p))
#             print '\n'

c = []
for j in [2]:  # looking only at the parents of E
    for i in range(0, x.shape[1]):
        if a[i, j] == 1:
            c.append([0, 1])  # for each parent of E, add one domain vector to c

comb = utils.cartesian(c)
for i in range(0, len(comb)):
    ind = np.logical_and(np.equal(x[:, 0], comb[i, 0]),
                         np.equal(x[:, 1], comb[i, 1]))
    ind = np.logical_and(ind, np.equal(x[:, 3], comb[i, 2]))
    ind = np.logical_and(ind, np.equal(x[:, 4], comb[i, 3]))
    xl = x[ind, :]
    p = utils.simple_probability(xl, 2, np.equal, 1)
    if p == -1:
        print 'P(C={:d},F={:d},M={:d},A={:d})=0%\n'.format(
            comb[i, 0], comb[i, 1], comb[i, 2], comb[i, 3])
    else:
        print 'P(E=1|C={:d},F={:d},M={:d},A={:d})={:3.2f}%'.format(
            comb[i, 0], comb[i, 1], comb[i, 2], comb[i, 3], 100.0 * p)
        print 'P(E=0|C={:d},F={:d},M={:d},A={:d})={:3.2f}%\n'.format(
            comb[i, 0], comb[i, 1], comb[i, 2], comb[i, 3], 100.0 * (1 - p))
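# The loop above relies on utils.cartesian expanding the per-parent binary
# domains into every configuration of E's parents. An equivalent stand-in
# (the project's real helper may be the classic NumPy "cartesian" recipe):
import itertools
import numpy as np

def cartesian(arrays):
    # one row per combination, one element drawn from each input list
    return np.array(list(itertools.product(*arrays)))

print(cartesian([[0, 1], [0, 1]]))
# [[0 0]
#  [0 1]
#  [1 0]
#  [1 1]]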
        break
    return topic_lists


# Build the Twitter queries
print 'Building and grouping Twitter queries...'
names = []
queries = []
for p1, p2 in pairwise(parties):
    party_list = [p1, p2]
    topic_lists = group_topics(party_list, topics)
    for topic_list in topic_lists:
        query_total = ''
        couples = cartesian([party_list, topic_list])
        for party, topic in couples:
            query = '((' + party + ' OR #' + party + ') (' + topic + ' OR #' + topic + '))'
            if not query_total:
                query_total = query
            else:
                query_total = query_total + ' OR ' + query
        names.append('#'.join(party_list + topic_list))
        queries.append(query_total)

# Launch the scraping processes
print 'Preparing to launch...'
commands = ''
for i, query in enumerate(queries):
def task(tree_file):
    if os.path.exists("assets/processed/stops_aligned/{}".format(
            tree_file.split("/")[-1])):
        print("passed", tree_file)
        return

    try:
        tree = np.load(tree_file, allow_pickle=True)["arr_0"].item()
    except:
        print(tree_file)
        return  # cannot proceed without the tree

    stop_tree = {}
    for route_id in tqdm(tree):
        stops = routes_data[route_id]
        stop_tree[route_id] = {}
        directions = []
        for e in range(1, len(stops)):
            directions.append(
                cartesian(*stops_data[stops[e]][:2]) -
                cartesian(*stops_data[stops[e - 1]][:2]))

        for each_trip in tree[route_id]:
            stop_tree[route_id][each_trip] = [None] * len(stops)
            for start_stop in range(0, len(stops)):
                if tree[route_id][each_trip][start_stop] is None:
                    continue
                trip_stop_data = np.array(
                    tree[route_id][each_trip][start_stop])

                if (len(trip_stop_data) > 1 and
                        (np.diff([e[0] for e in trip_stop_data]) < 0).any()):
                    if (np.count_nonzero(
                            np.diff([e[0] for e in trip_stop_data]) < 0) > 1):
                        continue
                    else:
                        trip_stop_data = sorted(trip_stop_data,
                                                key=lambda e: e[0])

                _, un_repeat_stops = np.unique([e[0] for e in trip_stop_data],
                                               return_index=True)
                trip_stop_data = np.array(trip_stop_data)[un_repeat_stops]
                assert (np.diff([e[0] for e in trip_stop_data]) > 0).all()

                distances = np.array([
                    haversine_dist(*e[2:], *stops_data[stops[start_stop]][:2])
                    for e in trip_stop_data
                ])
                time = np.array([e[0] for e in trip_stop_data])
                close_time = np.argmin(distances)
                close_time_val = time[close_time]
                time -= time[close_time]

                max_range = np.zeros(len(time), dtype=bool)
                max_range[-15 + close_time:close_time] = True
                max_range[close_time:close_time + 15] = True
                useful_indices = np.logical_and(
                    np.logical_and(time < 5 * 60, time > -5 * 60), max_range)
                time = time[useful_indices]
                distances = distances[useful_indices]
                trip_stop_data = trip_stop_data[useful_indices]

                if start_stop == 0:
                    prev_dir = -1 * directions[0]
                    next_dir = directions[0]
                elif start_stop == len(stops) - 1:
                    prev_dir = -1 * directions[len(stops) - 2]
                    next_dir = directions[len(stops) - 2]
                else:
                    prev_dir = -1 * directions[start_stop - 1]
                    next_dir = directions[start_stop]

                prev_dir = np.array([
                    get_angle(
                        prev_dir,
                        cartesian(*e[2:]) -
                        cartesian(*stops_data[stops[start_stop]][:2]),
                    ) for e in trip_stop_data
                ])
                next_dir = np.array([
                    get_angle(
                        next_dir,
                        cartesian(*e[2:]) -
                        cartesian(*stops_data[stops[start_stop]][:2]),
                    ) for e in trip_stop_data
                ])

                backward = prev_dir > next_dir
                displacement = distances * (-1 * (backward - 0.5) * 2)

                useful_indices = longest_subsequence(displacement,
                                                     return_index=True)
                displacement = displacement[useful_indices]
                time = time[useful_indices]
                trip_stop_data = trip_stop_data[useful_indices]

                if (len(displacement) > 1 and -32 > displacement[0]
                        and -32 < displacement[-1]):
                    stop_tree[route_id][each_trip][start_stop] = int(
                        close_time_val + interpolate.interp1d(
                            displacement, time,
                            fill_value="extrapolate")(-32))
                elif len(displacement) > 1:
                    dist_diff = np.diff(displacement)
                    drequired = (displacement[0] + 32
                                 if displacement[0] > -32
                                 else displacement[-1] + 32)
                    velocity_ind = np.argmin(np.abs(dist_diff - drequired))
                    velocity = dist_diff[velocity_ind] / (
                        time[velocity_ind + 1] - time[velocity_ind])
                    trequired = (time[0] - drequired / velocity
                                 if displacement[0] > -32
                                 else time[-1] + drequired / velocity)
                    stop_tree[route_id][each_trip][start_stop] = int(
                        close_time_val + trequired)
                else:
                    assert len(displacement) == 1
                    speed = trip_stop_data[0][1] * 3.6
                    drequired = displacement[0] + 32
                    if speed == 0:
                        speed = 2.7
                    trequired = (time[0] - drequired / speed
                                 if displacement[0] > -32
                                 else time[-1] + drequired / speed)
                    stop_tree[route_id][each_trip][start_stop] = int(
                        close_time_val + trequired)

    np.savez_compressed(
        "assets/processed/stops_aligned/{}".format(tree_file.split("/")[-1]),
        stop_tree,
    )
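# A minimal sketch of the arrival-time trick used above: given signed
# displacement along the route relative to a stop (negative = before the
# stop) and time offsets around the closest GPS fix, interpolate the moment
# the vehicle crossed the -32 m mark. Values below are made up.
import numpy as np
from scipy import interpolate

displacement = np.array([-80.0, -45.0, -10.0, 25.0])   # metres, increasing
time = np.array([-60.0, -35.0, -5.0, 20.0])            # seconds, 0 = closest fix

crossing = interpolate.interp1d(displacement, time,
                                fill_value="extrapolate")(-32)
print(float(crossing))   # ~ -23.9 s, i.e. just before the closest fix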