def bql_row_predictive_relevance(
        bdb, population_id, generator_id, modelnos, rowid_target,
        rowid_query, colno, *constraint_args):
    """Return the mean predictive relevance of the query rows (existing and
    hypothetical) to `rowid_target`, averaged first over each generator's
    models and then over the applicable generators.

    `rowid_query` is a JSON-encoded list of existing row ids.
    `constraint_args` flattens the hypothetical rows as (colno, value)
    pairs, with a None sentinel terminating each row.
    """
    if rowid_target is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    rowid_query = json.loads(rowid_query)
    modelnos = _retrieve_modelnos(modelnos)
    # Recover the per-row boundaries from the positions of the None
    # sentinels; -1 seeds the scan so the first row starts at index 0.
    boundaries = [-1]
    for position, token in enumerate(constraint_args):
        if token is None:
            boundaries.append(position)
    assert boundaries[-1] == len(constraint_args) - 1
    flat_rows = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        flat_rows.append(constraint_args[start+1:stop])
    assert all(len(flat) % 2 == 0 for flat in flat_rows)
    # Pair up each row's flattened (colno, value) entries.
    hypotheticals = [zip(flat[::2], flat[1::2]) for flat in flat_rows]
    if not rowid_query and not hypotheticals:
        raise BQLError(bdb, 'No matching rows for PREDICTIVE RELEVANCE.')
    def generator_similarity(generator_id):
        # One relevance value per model of this generator.
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        return backend.predictive_relevance(
            bdb, generator_id, modelnos, rowid_target, rowid_query,
            hypotheticals, colno)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    per_generator = [generator_similarity(gid) for gid in generator_ids]
    return stats.arithmetic_mean(
        [stats.arithmetic_mean(values) for values in per_generator])
def bql_row_predictive_relevance(bdb, population_id, generator_id, modelnos,
        rowid_target, rowid_query, colno, *constraint_args):
    """Average predictive relevance of query and hypothetical rows to the
    target row, over all applicable generators and their models.

    `constraint_args` encodes hypothetical rows flattened as
    (colno, value, ..., None); each None marks the end of one row.
    """
    if rowid_target is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    rowid_query = json.loads(rowid_query)
    modelnos = _retrieve_modelnos(modelnos)
    # Locate the None sentinels that separate the flattened rows.
    sentinel_idxs = [-1] + [
        idx for idx, arg in enumerate(constraint_args) if arg is None
    ]
    assert sentinel_idxs[-1] == len(constraint_args) - 1
    segments = []
    for k in range(len(sentinel_idxs) - 1):
        lo = sentinel_idxs[k] + 1
        hi = sentinel_idxs[k + 1]
        segments.append(constraint_args[lo:hi])
    assert all(len(segment) % 2 == 0 for segment in segments)
    # Pair each segment back up into (colno, value) tuples.
    hypotheticals = [
        zip(segment[::2], segment[1::2]) for segment in segments
    ]
    if len(rowid_query) == 0 and len(hypotheticals) == 0:
        raise BQLError(bdb, 'No matching rows for PREDICTIVE RELEVANCE.')
    def generator_similarity(generator_id):
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        return metamodel.predictive_relevance(bdb, generator_id, modelnos,
            rowid_target, rowid_query, hypotheticals, colno)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    # Inner mean is over one generator's models, outer over generators.
    means = [
        stats.arithmetic_mean(generator_similarity(gid))
        for gid in generator_ids
    ]
    return stats.arithmetic_mean(means)
def bql_column_mutual_information(bdb, population_id, generator_id, colno0,
        colno1, numsamples, *constraint_args):
    """Return the mean of per-model CMI estimates across all generators.

    Delegates to `_bql_column_mutual_information`, which yields one list of
    estimates per generator.
    """
    per_generator = _bql_column_mutual_information(bdb, population_id,
        generator_id, colno0, colno1, numsamples, *constraint_args)
    # XXX Averaging the CMI returned by each model of all generators in
    # the population does not directly correspond to any meaningful
    # probabilistic quantity, other than literally the mean CMI averaged
    # over all population models.
    inner_means = [stats.arithmetic_mean(estimates)
        for estimates in per_generator]
    return stats.arithmetic_mean(inner_means)
def bql_column_mutual_information(
        bdb, population_id, generator_id, modelnos, colnos0, colnos1,
        numsamples, *constraint_args):
    """Return the mean CMI estimate between two JSON-encoded lists of
    variable numbers, averaged over models and generators."""
    colnos0 = json.loads(colnos0)
    colnos1 = json.loads(colnos1)
    modelnos = _retrieve_modelnos(modelnos)
    per_generator = _bql_column_mutual_information(
        bdb, population_id, generator_id, modelnos, colnos0, colnos1,
        numsamples, *constraint_args)
    # XXX Averaging the per-model CMI estimates of every generator does
    # not directly correspond to any meaningful probabilistic quantity,
    # other than literally the mean CMI over all population models.
    return stats.arithmetic_mean(
        [stats.arithmetic_mean(estimates) for estimates in per_generator])
def generator_similarity(generator_id):
    """Return the mean row similarity across this generator's models.

    Closes over bdb, modelnos, rowid, target_rowid, and colno from the
    enclosing scope.
    """
    mm = core.bayesdb_generator_metamodel(bdb, generator_id)
    # XXX Change [colno] to colno by updating IBayesDBMetamodel.
    per_model = mm.row_similarity(
        bdb, generator_id, modelnos, rowid, target_rowid, [colno])
    return stats.arithmetic_mean(per_model)
def column_mutual_information(self, bdb, generator_id, modelno, colno0,
        colno1, constraints=None, numsamples=None):
    """Estimate the mutual information of two columns via the engine.

    The engine returns one estimate per model; this method integrates by
    taking their arithmetic mean.
    """
    # XXX Default number of samples drawn from my arse.
    if numsamples is None:
        numsamples = 1000
    engine = self._engine(bdb, generator_id)
    # One MI sample per model; it is our responsibility to integrate.
    estimates = engine.mutual_information(
        colno0, colno1, evidence=constraints, N=numsamples,
        multiprocess=self._ncpu)
    # XXX Is this integral correct?  Should it be weighted?
    return arithmetic_mean(estimates)
def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
        colnos1, constraints, numsamples):
    """Return a one-element list with the mean Loom MI estimate between
    the variable sets `colnos0` and `colnos1`, conditioned on
    `constraints`.
    """
    population_id = bayesdb_generator_population(bdb, generator_id)
    # Translate bayesdb column numbers into the string names Loom uses.
    colnames0 = [
        str(bayesdb_variable_name(bdb, population_id, generator_id, colno))
        for colno in colnos0
    ]
    colnames1 = [
        str(bayesdb_variable_name(bdb, population_id, generator_id, colno))
        for colno in colnos1
    ]
    server = self._get_preql_server(bdb, generator_id)
    # Encode each name set as a Loom column mask.
    target_set = server._cols_to_mask(server.encode_set(colnames0))
    query_set = server._cols_to_mask(server.encode_set(colnames1))
    if self._marginize_cmi(constraints):
        # NOTE(review): presumably this branch marginalizes over sampled
        # constraint rows — confirm against _get_constraint_rows.
        inner_numsamples = numsamples
        conditioning_rows_loom_format = self._get_constraint_rows(
            constraints, bdb, generator_id, population_id, modelnos,
            server, inner_numsamples)
    else:
        # Single fully-specified conditioning row.
        conditioning_rows_loom_format = [
            self._get_constraint_row(constraints, bdb, generator_id,
                population_id, server)
        ]
    # One MI estimate per conditioning row; .mean is the estimate's value.
    mi_estimates = [
        server._query_server.mutual_information(
            target_set, query_set, entropys=None,
            sample_count=loom.preql.SAMPLE_COUNT,
            conditioning_row=conditioning_row_loom_format).mean
        for conditioning_row_loom_format in conditioning_rows_loom_format
    ]
    # Output requires an iterable.
    return [arithmetic_mean(mi_estimates)]
def bql_column_dependence_probability(bdb, population_id, generator_id,
        colno0, colno1):
    """Mean dependence probability of two variables over all applicable
    generators (modelno fixed to None)."""
    def generator_depprob(generator_id):
        mm = core.bayesdb_generator_metamodel(bdb, generator_id)
        return mm.column_dependence_probability(
            bdb, generator_id, None, colno0, colno1)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    return stats.arithmetic_mean(
        [generator_depprob(gid) for gid in generator_ids])
def bql_column_dependence_probability(
        bdb, population_id, generator_id, modelnos, colno0, colno1):
    """Mean dependence probability of two variables, averaged over each
    generator's models and then over the generators."""
    modelnos = _retrieve_modelnos(modelnos)
    def generator_depprob(generator_id):
        # Average the per-model probabilities within one generator.
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        per_model = backend.column_dependence_probability(
            bdb, generator_id, modelnos, colno0, colno1)
        return stats.arithmetic_mean(per_model)
    gids = _retrieve_generator_ids(bdb, population_id, generator_id)
    return stats.arithmetic_mean([generator_depprob(gid) for gid in gids])
def bql_row_similarity(
        bdb, population_id, generator_id, modelnos, rowid, target_rowid,
        colno):
    """Mean similarity of `rowid` to `target_rowid` with respect to one
    variable, averaged over applicable generators."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    modelnos = _retrieve_modelnos(modelnos)
    def generator_similarity(generator_id):
        mm = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Change [colno] to colno by updating IBayesDBMetamodel.
        return mm.row_similarity(
            bdb, generator_id, modelnos, rowid, target_rowid, [colno])
    gids = _retrieve_generator_ids(bdb, population_id, generator_id)
    return stats.arithmetic_mean(
        [generator_similarity(gid) for gid in gids])
def bql_column_dependence_probability(bdb, population_id, generator_id,
        modelnos, colno0, colno1):
    """Mean dependence probability of colno0 and colno1 across the
    requested models of every applicable generator."""
    modelnos = _retrieve_modelnos(modelnos)
    def generator_depprob(generator_id):
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        depprob_list = backend.column_dependence_probability(
            bdb, generator_id, modelnos, colno0, colno1)
        return stats.arithmetic_mean(depprob_list)
    # Collect one per-generator mean, then average those.
    depprobs = []
    for gid in _retrieve_generator_ids(bdb, population_id, generator_id):
        depprobs.append(generator_depprob(gid))
    return stats.arithmetic_mean(depprobs)
def bql_row_similarity(
        bdb, population_id, generator_id, modelnos, rowid, target_rowid,
        colno):
    """Mean similarity of two rows with respect to one variable, averaged
    over models and then over generators."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    modelnos = _retrieve_modelnos(modelnos)
    def generator_similarity(generator_id):
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Change [colno] to colno by updating BayesDB_Backend.
        per_model = backend.row_similarity(
            bdb, generator_id, modelnos, rowid, target_rowid, [colno])
        return stats.arithmetic_mean(per_model)
    # Average per-generator means.
    gids = _retrieve_generator_ids(bdb, population_id, generator_id)
    per_generator = []
    for gid in gids:
        per_generator.append(generator_similarity(gid))
    return stats.arithmetic_mean(per_generator)
def bql_column_mutual_information(
        bdb, population_id, generator_id, colno0, colno1, numsamples,
        *constraint_args):
    """Return the mean mutual-information estimate of two variables.

    `constraint_args` is a flat sequence of alternating (column, value)
    constraint pairs; an odd-length sequence is rejected.

    Raises `ValueError` if `constraint_args` has odd length.
    """
    if len(constraint_args) % 2 == 1:
        # BUG FIX: the original formatted the bare tuple with %, which
        # raises TypeError ("not all arguments converted") for any odd
        # length >= 3.  Wrap in a 1-tuple so %s sees a single argument.
        raise ValueError(
            'Odd constraint arguments: %s.' % (constraint_args,))
    constraints = (
        dict(zip(constraint_args[::2], constraint_args[1::2]))
        if constraint_args else None)
    def generator_mutinf(generator_id):
        # modelno is None: query across all of the generator's models.
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        return metamodel.column_mutual_information(
            bdb, generator_id, None, colno0, colno1,
            constraints=constraints, numsamples=numsamples)
    if generator_id is not None:
        generator_ids = [generator_id]
    else:
        generator_ids = core.bayesdb_population_generators(
            bdb, population_id)
    mutinfs = [generator_mutinf(gid) for gid in generator_ids]
    return stats.arithmetic_mean(mutinfs)
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
        *colnos):
    """Mean similarity of `rowid` to `target_rowid` with respect to
    `colnos` (all population variables when no columns are given)."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if not colnos:
        # Default to every variable in the population.
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
            generator_id)
    def generator_similarity(generator_id):
        mm = core.bayesdb_generator_metamodel(bdb, generator_id)
        return mm.row_similarity(
            bdb, generator_id, None, rowid, target_rowid, colnos)
    if generator_id is not None:
        generator_ids = [generator_id]
    else:
        generator_ids = core.bayesdb_population_generators(
            bdb, population_id)
    return stats.arithmetic_mean(
        [generator_similarity(gid) for gid in generator_ids])
def column_dependence_probability(self, bdb, generator_id, modelno, colno0,
        colno1):
    """Probability that two columns are dependent, averaged over models.

    A column is trivially dependent on itself, so that case short-circuits
    without consulting the engine.
    """
    # Optimize the vacuous self-dependence case.
    # XXX Caller should avoid this.
    if colno0 == colno1:
        return 1
    engine = self._engine(bdb, generator_id)
    # The engine yields one dependence probability per model; integrate
    # by averaging.
    per_model = engine.dependence_probability(
        colno0, colno1, multiprocess=self._multiprocess)
    return arithmetic_mean(per_model)
def row_similarity(self, bdb, generator_id, modelno, rowid, target_rowid,
        colnos):
    """Similarity of two rows with respect to `colnos`, averaged over
    the engine's models."""
    # Translate bayesdb row ids into the cgpm's rowid space.
    source_rowid = self._cgpm_rowid(bdb, generator_id, rowid)
    dest_rowid = self._cgpm_rowid(bdb, generator_id, target_rowid)
    engine = self._engine(bdb, generator_id)
    # One similarity per model; integrate by averaging.
    per_model = engine.row_similarity(
        source_rowid, dest_rowid, colnos, multiprocess=self._ncpu)
    return arithmetic_mean(per_model)
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
        *colnos):
    """Mean similarity of `rowid` to `target_rowid` with respect to
    exactly one variable; rejects multi-column requests."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if not colnos:
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
            generator_id)
    # Only a single with-respect-to column is supported here.
    if len(colnos) != 1:
        raise BQLError(bdb,
            'Multiple with respect to columns: %s.' % (colnos,))
    def generator_similarity(generator_id):
        mm = core.bayesdb_generator_metamodel(bdb, generator_id)
        return mm.row_similarity(
            bdb, generator_id, None, rowid, target_rowid, colnos)
    gids = _retrieve_generator_ids(bdb, population_id, generator_id)
    per_generator = [generator_similarity(gid) for gid in gids]
    return stats.arithmetic_mean(per_generator)
def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
        colnos1, constraints, numsamples):
    """Return a one-element list with the mean Loom MI estimate between
    the variable sets `colnos0` and `colnos1`, given `constraints`."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    def variable_names(colnos):
        # Loom addresses variables by (string) name, not column number.
        return [
            str(bayesdb_variable_name(
                bdb, population_id, generator_id, colno))
            for colno in colnos
        ]
    names0 = variable_names(colnos0)
    names1 = variable_names(colnos1)
    server = self._get_preql_server(bdb, generator_id)
    target_set = server._cols_to_mask(server.encode_set(names0))
    query_set = server._cols_to_mask(server.encode_set(names1))
    if self._marginize_cmi(constraints):
        inner_numsamples = numsamples
        conditioning_rows = self._get_constraint_rows(
            constraints, bdb, generator_id, population_id, modelnos,
            server, inner_numsamples)
    else:
        # Single fully-specified conditioning row.
        conditioning_rows = [
            self._get_constraint_row(constraints, bdb, generator_id,
                population_id, server)
        ]
    estimates = []
    for conditioning_row in conditioning_rows:
        result = server._query_server.mutual_information(
            target_set, query_set, entropys=None,
            sample_count=loom.preql.SAMPLE_COUNT,
            conditioning_row=conditioning_row)
        estimates.append(result.mean)
    # Output requires an iterable.
    return [arithmetic_mean(estimates)]
def generator_depprob(generator_id):
    """Mean per-model dependence probability for one generator.

    Closes over bdb, modelnos, colno0, and colno1.
    """
    backend = core.bayesdb_generator_backend(bdb, generator_id)
    per_model = backend.column_dependence_probability(
        bdb, generator_id, modelnos, colno0, colno1)
    return stats.arithmetic_mean(per_model)
def generator_similarity(generator_id):
    """Mean per-model row similarity for one generator.

    Closes over bdb, modelnos, rowid, target_rowid, and colno.
    """
    backend = core.bayesdb_generator_backend(bdb, generator_id)
    # XXX Change [colno] to colno by updating BayesDB_Backend.
    per_model = backend.row_similarity(
        bdb, generator_id, modelnos, rowid, target_rowid, [colno])
    return stats.arithmetic_mean(per_model)
def generator_similarity(generator_id):
    """Average this generator's per-model similarities between the two
    rows (closure over bdb, modelnos, rowid, target_rowid, colno)."""
    backend = core.bayesdb_generator_backend(bdb, generator_id)
    # XXX Change [colno] to colno by updating BayesDB_Backend.
    model_sims = backend.row_similarity(bdb, generator_id, modelnos,
        rowid, target_rowid, [colno])
    return stats.arithmetic_mean(model_sims)
def generator_depprob(generator_id):
    """Average this generator's per-model dependence probabilities
    (closure over bdb, modelnos, colno0, colno1)."""
    mm = core.bayesdb_generator_metamodel(bdb, generator_id)
    probabilities = mm.column_dependence_probability(
        bdb, generator_id, modelnos, colno0, colno1)
    return stats.arithmetic_mean(probabilities)