def global_position(self):
    # No need to move the origin to get the rotation: that translation is implicit
    # in the other measurements.
    # TODO: change this so it is only executed once, when the local AoA is set.
    if isinf(self.local_position):
        return_value = -inf
    else:
        # The linear algebra below is nice, but all we actually need to do is add the
        # delta between the global reading and the local reading to get the correct
        # result; that is also cheaper to compute.
        return_value = modulo_heading(
            self.local_position + self.sensor_properties.location.heading)
        #local_sensor_position = array([sqrt(2) * cos(self.local_aoa), sqrt(2) * sin(self.local_aoa)])
        #global_particle_position = dot(linalg.inv(self.sensor_properties.rotation_matrix), local_sensor_position)
        #return_value = cartesian_to_polar(global_particle_position[0][0], global_particle_position[0][1],
        #                                  response_units=AngleUnits.radians)[1]
    return return_value
def local_position(self, arg_aoa):
    if isinf(arg_aoa):
        self._aoa = -inf
    else:
        self._aoa = modulo_heading(arg_aoa)
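# Hedged sanity-check sketch (not part of the original module): the cheap "add the
# heading delta" path used in global_position above should agree with the commented-out
# rotation-matrix path. Everything below is a standalone, hypothetical re-implementation;
# modulo_heading, the sample angles and the rotation matrix are assumptions for
# illustration only.
def _heading_offset_matches_rotation_matrix():
    import numpy as np

    def modulo_heading(angle):
        # Hypothetical stand-in: wrap an angle into (-pi, pi].
        return np.arctan2(np.sin(angle), np.cos(angle))

    local_aoa = 2.5        # bearing measured in the sensor's local frame (radians)
    sensor_heading = 1.2   # sensor heading in the global frame (radians)

    # Cheap path: add the frame offset and wrap.
    global_simple = modulo_heading(local_aoa + sensor_heading)

    # Linear-algebra path: rotate a unit bearing vector from the local frame back
    # into the global frame via the inverse of the global-to-local rotation matrix.
    global_to_local = np.array([[np.cos(sensor_heading), np.sin(sensor_heading)],
                                [-np.sin(sensor_heading), np.cos(sensor_heading)]])
    local_vec = np.array([np.cos(local_aoa), np.sin(local_aoa)])
    global_vec = np.linalg.inv(global_to_local) @ local_vec
    global_matrix = np.arctan2(global_vec[1], global_vec[0])

    assert np.isclose(global_simple, global_matrix)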
def isinf(x):
    if isinstance(x, IDataDescriptor):
        return _um.isinf(dd_as_py(x))
    else:
        return _um.isinf(x)
def get_format_func(self, elem, **options):
    missing_opt = self.check_options(**options)
    if missing_opt:
        raise Exception("Missing options: {}".format(missing_opt))

    floatmode = options['floatmode']
    precision = None if floatmode == 'unique' else options['precision']
    suppress_small = options['suppress_small']
    sign = options['sign']
    infstr = options['infstr']
    nanstr = options['nanstr']

    exp_format = False
    pad_left, pad_right = 0, 0

    # only the finite values are used to compute the number of digits
    finite = umath.isfinite(elem)
    finite_vals = elem[finite]
    nonfinite_vals = elem[~finite]

    # choose exponential mode based on the non-zero finite values:
    abs_non_zero = umath.absolute(finite_vals[finite_vals != 0])
    if len(abs_non_zero) != 0:
        max_val = np.max(abs_non_zero)
        min_val = np.min(abs_non_zero)
        with np.errstate(over='ignore'):  # division can overflow
            if max_val >= 1.e8 or (not suppress_small and
                                   (min_val < 0.0001 or max_val / min_val > 1000.)):
                exp_format = True

    # do a first pass of printing all the numbers, to determine sizes
    if len(finite_vals) == 0:
        trim, exp_size, unique = '.', -1, True
    elif exp_format:
        trim, unique = '.', True
        if floatmode == 'fixed':
            trim, unique = 'k', False
        strs = (format_float_scientific(x, precision=precision,
                                        unique=unique, trim=trim,
                                        sign=sign == '+')
                for x in finite_vals)
        frac_strs, _, exp_strs = zip(*(s.partition('e') for s in strs))
        int_part, frac_part = zip(*(s.split('.') for s in frac_strs))
        exp_size = max(len(s) for s in exp_strs) - 1

        trim = 'k'
        precision = max(len(s) for s in frac_part)

        # this should be only 1 or 2. Can be calculated from sign.
        pad_left = max(len(s) for s in int_part)
        # pad_right is only needed for nan length calculation
        pad_right = exp_size + 2 + precision

        unique = False
    else:
        trim, unique = '.', True
        if floatmode == 'fixed':
            trim, unique = 'k', False
        strs = (format_float_positional(x, precision=precision,
                                        fractional=True,
                                        unique=unique, trim=trim,
                                        sign=sign == '+')
                for x in finite_vals)
        int_part, frac_part = zip(*(s.split('.') for s in strs))
        pad_left = max(len(s) for s in int_part)
        pad_right = max(len(s) for s in frac_part)
        exp_size = -1

        if floatmode in ['fixed', 'maxprec_equal']:
            precision = pad_right
            unique = False
            trim = 'k'
        else:
            unique = True
            trim = '.'

    # account for sign = ' ' by adding one to pad_left
    if sign == ' ' and not any(np.signbit(finite_vals)):
        pad_left += 1

    # account for nan and inf in pad_left
    if len(nonfinite_vals) != 0:
        nanlen, inflen = 0, 0
        if np.any(umath.isinf(nonfinite_vals)):
            neginf = sign != '-' or np.any(np.isneginf(nonfinite_vals))
            inflen = len(infstr) + neginf
        if np.any(umath.isnan(elem)):
            nanlen = len(nanstr)
        offset = pad_right + 1  # +1 for decimal pt
        pad_left = max(nanlen - offset, inflen - offset, pad_left)

    def print_nonfinite(x):
        with errstate(invalid='ignore'):
            if umath.isnan(x):
                ret = ('+' if sign == '+' else '') + nanstr
            else:  # isinf
                infsgn = '-' if x < 0 else '+' if sign == '+' else ''
                ret = infsgn + infstr
            return ' ' * (pad_left + pad_right + 1 - len(ret)) + ret

    if exp_format:
        def print_finite(x):
            return format_float_scientific(x, precision=precision,
                                           unique=unique, trim=trim,
                                           sign=sign == '+',
                                           pad_left=pad_left,
                                           exp_digits=exp_size)
    else:
        def print_finite(x):
            return format_float_positional(x, precision=precision,
                                           unique=unique, fractional=True,
                                           trim=trim, sign=sign == '+',
                                           pad_left=pad_left,
                                           pad_right=pad_right)

    def fmt(x):
        if umath.isfinite(x):
            return print_finite(x)
        else:
            return print_nonfinite(x)

    return fmt
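# Hedged example (not part of the original module): to see what the two branches of
# get_format_func produce, here is a standalone comparison using the same NumPy helpers
# it wraps (format_float_positional / format_float_scientific). The values and options
# are illustrative only; the scientific branch corresponds to the case above where
# max_val >= 1.e8 or max_val / min_val > 1000.
def _compare_float_format_modes():
    import numpy as np
    from numpy import format_float_positional, format_float_scientific

    vals = np.array([0.0001234, 1.5, 98765.4321])

    # Positional mode, used when the dynamic range of the finite values is small.
    positional = [format_float_positional(v, precision=4, unique=False, trim='k')
                  for v in vals]
    print(positional)   # expected: ['0.0001', '1.5000', '98765.4321']

    # Scientific mode, used when the values span a wide range.
    scientific = [format_float_scientific(v, precision=4, unique=False, trim='k')
                  for v in vals]
    print(scientific)   # expected: ['1.2340e-04', '1.5000e+00', '9.8765e+04']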
def isinf(x):
    if isinstance(x, DDesc):
        return _um.isinf(ddesc_as_py(x))
    else:
        return _um.isinf(x)
def sample(self, num_iterations):
    """
    for num_iters:
        for each customer:
            remove him from his old_table and update the table params;
            if old_table is empty, remove the table
            calculate prior and likelihood for this customer sitting at each table
            sample a table index
            if new_table is equal to old_table, we don't have to update the parameters,
            else update params of the old table
    """
    if self.show_topics is not None:
        print("Topics after initialization")
        print(self.format_topics())

        # Compute the overall usage of topics across the training corpus
        topic_props = self.table_counts_per_doc.sum(axis=1).astype(np.float64)
        topic_props /= topic_props.sum()
        print("Words using topics: {}".format(", ".join(
            "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_props * 100.))))
        topic_doc_props = (self.table_counts_per_doc > 0).astype(np.float64).sum(axis=1)
        topic_doc_props /= self.num_documents
        print("Docs using topics: {}".format(", ".join(
            "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_doc_props * 100.))))

    with VoseAliasUpdater(
            self.aliases, self.vocab_embeddings,
            self.prior.kappa, self.prior.nu,
            self.table_counts, self.table_means,
            self.table_cholesky_ltriangular_mat,
            self.log_determinants,
            das_normalization=self.das_normalization,
    ) as alias_updater:
        for iteration in range(num_iterations):
            stats = SamplingDiagnostics()
            self.log.info("Iteration {}".format(iteration))

            alias_updater.unpause()
            pbar = get_progress_bar(len(self.corpus), title="Sampling",
                                    show_progress=self.show_progress)
            for d, doc in enumerate(pbar(self.corpus)):
                if self.show_topics is not None and self.show_topics > 0 and d % self.show_topics == 0:
                    print("Topics after {:,} docs".format(d))
                    print(self.format_topics())

                for w, cust_id in enumerate(doc):
                    x = self.vocab_embeddings[cust_id]

                    # Remove custId from his old_table
                    old_table_id = self.table_assignments[d][w]
                    self.table_assignments[d][w] = -1  # Doesn't really make any difference, as only counts are used
                    with self.table_counts.lock:
                        self.table_counts.np[old_table_id] -= 1
                    self.table_counts_per_doc[old_table_id, d] -= 1
                    # Update vector means etc
                    self.sum_squared_table_customers[old_table_id] -= np.outer(x, x)

                    # Topic 'old_table_id' now has one member fewer
                    # Just update params for this customer
                    self.update_table_params(old_table_id, cust_id, is_removed=True)

                    # Under the alias method, we only do the full likelihood computation for topics
                    # that already have a non-zero count in the current document
                    non_zero_tables = np.where(self.table_counts_per_doc[:, d] > 0)[0]
                    if len(non_zero_tables) == 0:
                        # If there's only one word in a doc, there are no topics to compute the full posterior for
                        no_non_zero = True
                    else:
                        no_non_zero = False
                        # We only compute the posterior for these topics
                        log_priors = np.log(self.table_counts_per_doc[non_zero_tables, d])
                        log_likelihoods = np.zeros(len(non_zero_tables), dtype=np.float32)
                        for nz_table, table in enumerate(non_zero_tables):
                            log_likelihoods[nz_table] = self.log_multivariate_tdensity(x, table)
                        log_posterior = log_priors + log_likelihoods

                        # To prevent overflow, subtract by log(p_max)
                        max_log_posterior = log_posterior.max()
                        scaled_posterior = log_posterior - max_log_posterior
                        if self.das_normalization:
                            # Not doing this now, but following what the Java impl does, however odd that seems
                            psum = np.sum(np.exp(scaled_posterior))
                        else:
                            # Java impl subtracts max before computing psum, but this seems to be wrong
                            # We still subtract first, but then multiply by the max prob afterwards
                            psum = np.exp(np.log(np.sum(np.exp(scaled_posterior))) + max_log_posterior)

                        # Now just use the scaled log posterior in the same way as in the Java impl
                        # They have a bin-search method for sampling from the cumulative dist,
                        # but we simply normalize and use Numpy to sample
                        unnormed_posterior = np.exp(scaled_posterior)
                        normed_posterior = unnormed_posterior / unnormed_posterior.sum()

                    # Don't let the alias parameters get updated in the middle of the sampling
                    self.aliases.lock.acquire_read(cust_id)
                    select_pr = psum / (psum + self.alpha * self.aliases.likelihood_sum.np[cust_id])

                    # MHV to draw new topic
                    # Take a number of Metropolis-Hastings samples
                    current_sample = old_table_id
                    # Calculate the true likelihood of this word under the current sample,
                    # for calculating acceptance prob
                    current_sample_log_prob = self.log_multivariate_tdensity(x, current_sample)
                    for r in range(self.mh_steps):
                        # 1. Flip a coin
                        if not no_non_zero and np.random.sample() < select_pr:
                            # Choose from the exactly computed posterior dist, only allowing
                            # topics already sampled in the doc
                            temp = np.random.choice(len(non_zero_tables), p=normed_posterior)
                            new_sample = non_zero_tables[temp]
                            stats.log_select_pr(True, select_pr)
                        else:
                            # Choose from the alias, allowing any topic but using slightly
                            # out-of-date likelihoods
                            new_sample = self.aliases.sample_vose(cust_id)
                            stats.log_select_pr(False, select_pr)

                        if new_sample != current_sample:
                            # 2. Find acceptance probability
                            new_sample_log_prob = self.log_multivariate_tdensity(x, new_sample)
                            # This can sometimes generate an overflow warning from Numpy
                            # We don't care, though: in that case acceptance > 1., so we always accept
                            with np.errstate(over="ignore"):
                                # From my reading of:
                                #   Li et al. (2014): Reducing the sampling complexity of topic models
                                # the acceptance probability should be as follows:
                                acceptance = \
                                    (self.table_counts_per_doc[new_sample, d] + self.alpha) / \
                                    (self.table_counts_per_doc[current_sample, d] + self.alpha) * \
                                    np.exp(new_sample_log_prob - current_sample_log_prob) * \
                                    (self.table_counts_per_doc[current_sample, d]*np.exp(current_sample_log_prob) +
                                     self.alpha*np.exp(self.aliases.log_likelihoods.np[cust_id, current_sample])) / \
                                    (self.table_counts_per_doc[new_sample, d]*np.exp(new_sample_log_prob) +
                                     self.alpha*np.exp(self.aliases.log_likelihoods.np[cust_id, new_sample]))
                                # The Java implementation, however, does this:
                                #acceptance = \
                                #    (self.table_counts_per_doc[new_table_id, d] + self.alpha) / \
                                #    (self.table_counts_per_doc[current_sample, d] + self.alpha) * \
                                #    np.exp(new_prob - old_prob) * \
                                #    (self.table_counts_per_doc[current_sample, d]*old_log_prob +
                                #     self.alpha*alias.w.np[current_sample]) / \
                                #    (self.table_counts_per_doc[new_table_id, d]*new_log_prob +
                                #     self.alpha*alias.w.np[new_table_id])
                                # The difference is the Java impl doesn't exp the log likelihood in the last
                                # fraction, i.e. it uses a log prob instead of a prob

                            # 3. Compare against uniform[0,1]
                            # If the acceptance prob > 1, we always accept: this means the new sample
                            # has a higher probability than the old
                            if isinf(acceptance) or acceptance >= 1. \
                                    or np.random.sample() < acceptance:
                                # No need to sample if acceptance >= 1
                                # If the acceptance prob < 1, sample whether to accept or not, such that
                                # the more likely the new sample is compared to the old, the more likely we
                                # are to keep it
                                current_sample = new_sample
                                current_sample_log_prob = new_sample_log_prob
                                stats.log_acceptance(True, acceptance)
                            else:
                                stats.log_acceptance(False, acceptance)
                            # NOTE: There seems to be a small error in the Java implementation here
                            # On the last MH step, it doesn't make any difference whether we accept the
                            # sample or not - we always end up using it
                    self.aliases.lock.release_read()

                    if current_sample == old_table_id:
                        stats.log_sampled_same()
                    else:
                        stats.log_sampled_different()

                    # Now have a new assignment: add its counts
                    self.table_assignments[d][w] = current_sample
                    with self.table_counts.lock:
                        self.table_counts.np[current_sample] += 1
                    self.table_counts_per_doc[current_sample, d] += 1
                    self.sum_squared_table_customers[current_sample] += np.outer(x, x)

                    self.update_table_params(current_sample, cust_id)

            # Pause the alias updater until we start the next iteration
            alias_updater.pause()

            # Output some useful stats about sampling
            if stats.acceptance_used():
                self.log.info(
                    "Acceptance rate = {:.2f}%, mean acceptance: {:.2f} ({:,} samples drawn)".format(
                        stats.acceptance_rate() * 100., stats.mean_acceptance(),
                        stats.acceptance_samples()))
            else:
                self.log.info("No new samples drawn")
            self.log.info("Prior select rate = {:.2f}%, mean select_pr = {:.2f}".format(
                stats.select_pr_rate() * 100., stats.mean_select_pr()))
            self.log.info("Chose new sample: {:.2f}%".format(stats.sample_change_rate() * 100.))

            if self.show_topics is not None:
                print("Topics after iteration {}".format(iteration))
                print(self.format_topics())

                # Compute the overall usage of topics across the training corpus
                topic_props = self.table_counts_per_doc.sum(axis=1).astype(np.float64)
                topic_props /= topic_props.sum()
                print("Words using topics: {}".format(", ".join(
                    "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_props * 100.))))
                topic_doc_props = (self.table_counts_per_doc > 0).astype(np.float64).sum(axis=1)
                topic_doc_props /= self.num_documents
                print("Docs using topics: {}".format(", ".join(
                    "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_doc_props * 100.))))

            if self.save_path is not None:
                self.log.info("Saving model")
                self.save()
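# Hedged reference sketch (not part of the original module): the Metropolis-Hastings
# acceptance ratio computed inside sample() can be written as a small standalone function.
# The argument names are hypothetical; the "stale" log likelihoods stand in for the
# possibly out-of-date values stored in the alias tables (self.aliases.log_likelihoods
# above). Assumes numpy is imported as np, as elsewhere in this module.
def mh_alias_acceptance(n_new, n_old, log_p_new, log_p_old,
                        stale_log_p_new, stale_log_p_old, alpha):
    """
    Acceptance ratio for the alias trick, following
    Li et al. (2014): Reducing the sampling complexity of topic models.

    n_*           -- per-document counts of the proposed / current topic
    log_p_*       -- exact log likelihoods of the word under each topic
    stale_log_p_* -- possibly out-of-date log likelihoods from the alias tables
    alpha         -- Dirichlet hyperparameter
    """
    # Target ratio: document-topic prior times the exact likelihood ratio
    target_ratio = (n_new + alpha) / (n_old + alpha) * np.exp(log_p_new - log_p_old)
    # Proposal ratio: the mixture proposal combines doc-topic counts with stale alias weights
    proposal_old = n_old * np.exp(log_p_old) + alpha * np.exp(stale_log_p_old)
    proposal_new = n_new * np.exp(log_p_new) + alpha * np.exp(stale_log_p_new)
    return target_ratio * proposal_old / proposal_new

# Example: the proposed topic is better supported here, so the ratio comes out slightly
# above 1 and the proposal is always accepted without drawing a uniform sample.
# mh_alias_acceptance(n_new=3, n_old=1, log_p_new=-2.0, log_p_old=-4.0,
#                     stale_log_p_new=-2.1, stale_log_p_old=-3.9, alpha=0.5)  # ~1.05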