def get_tables(self):
    """
    Returns the tables of the network.

    Example
    -------
    >>> writer = UAIWriter(model)
    >>> writer.get_tables()
    """
    if isinstance(self.model, BayesianModel):
        cpds = self.model.get_cpds()
        cpds.sort(key=lambda x: x.variable)
        tables = []
        for cpd in cpds:
            values = list(map(str, cpd.values.ravel()))
            tables.append(values)
        return tables
    elif isinstance(self.model, MarkovModel):
        factors = self.model.get_factors()
        tables = []
        for factor in factors:
            values = list(map(str, factor.values.ravel()))
            tables.append(values)
        return tables
    else:
        raise TypeError("Model must be an instance of Markov or Bayesian model.")
def get_distributions(self):
    """
    Returns a dictionary of name and its distribution. Distribution is a ndarray.

    The ndarray is stored in the standard way such that the rightmost variable
    changes most often. Consider a CPD of variable 'd' which has parents 'b' and
    'c' (distribution['CONDSET'] = ['b', 'c']):

                  |  d_0    d_1
    ---------------------------
    b_0, c_0      |  0.8    0.2
    b_0, c_1      |  0.9    0.1
    b_1, c_0      |  0.7    0.3
    b_1, c_1      |  0.05   0.95

    The value of distribution['d']['DPIS'] for the above example will be:
    array([[ 0.8 ,  0.2 ],
           [ 0.9 ,  0.1 ],
           [ 0.7 ,  0.3 ],
           [ 0.05,  0.95]])

    Examples
    --------
    >>> reader = XBNReader('xbn_test.xml')
    >>> reader.get_distributions()
    {'a': {'TYPE': 'discrete', 'DPIS': array([[ 0.2,  0.8]])},
     'e': {'TYPE': 'discrete', 'DPIS': array([[ 0.8,  0.2],
           [ 0.6,  0.4]]), 'CONDSET': ['c'], 'CARDINALITY': [2]},
     'b': {'TYPE': 'discrete', 'DPIS': array([[ 0.8,  0.2],
           [ 0.2,  0.8]]), 'CONDSET': ['a'], 'CARDINALITY': [2]},
     'c': {'TYPE': 'discrete', 'DPIS': array([[ 0.2 ,  0.8 ],
           [ 0.05,  0.95]]), 'CONDSET': ['a'], 'CARDINALITY': [2]},
     'd': {'TYPE': 'discrete', 'DPIS': array([[ 0.8 ,  0.2 ],
           [ 0.9 ,  0.1 ],
           [ 0.7 ,  0.3 ],
           [ 0.05,  0.95]]), 'CONDSET': ['b', 'c'], 'CARDINALITY': [2, 2]}}
    """
    distribution = {}
    for dist in self.bnmodel.find('DISTRIBUTIONS'):
        variable_name = dist.find('PRIVATE').get('NAME')
        distribution[variable_name] = {'TYPE': dist.get('TYPE')}
        if dist.find('CONDSET') is not None:
            distribution[variable_name]['CONDSET'] = [
                var.get('NAME')
                for var in dist.find('CONDSET').findall('CONDELEM')]
            distribution[variable_name]['CARDINALITY'] = np.array(
                [len(set(np.array([list(map(int, dpi.get('INDEXES').split()))
                                   for dpi in dist.find('DPIS')])[:, i]))
                 for i in range(len(distribution[variable_name]['CONDSET']))])
        distribution[variable_name]['DPIS'] = np.array(
            [list(map(float, dpi.text.split())) for dpi in dist.find('DPIS')])
    return distribution
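# The "rightmost variable changes fastest" layout means a flat CPD reshapes
# directly into the DPIS matrix: one row per parent configuration, one column
# per child state. A minimal numpy sketch using the docstring's numbers:
import numpy as np

flat = np.array([0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.05, 0.95])
dpis = flat.reshape(4, 2)  # rows: (b, c) configurations; columns: d_0, d_1
print(dpis)
# [[0.8  0.2 ]
#  [0.9  0.1 ]
#  [0.7  0.3 ]
#  [0.05 0.95]]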
def setUp(self):
    edges = [['family-out', 'dog-out'],
             ['bowel-problem', 'dog-out'],
             ['family-out', 'light-on'],
             ['dog-out', 'hear-bark']]
    cpds = {'bowel-problem': np.array([[0.01], [0.99]]),
            'dog-out': np.array([[0.99, 0.01, 0.97, 0.03],
                                 [0.9, 0.1, 0.3, 0.7]]),
            'family-out': np.array([[0.15], [0.85]]),
            'hear-bark': np.array([[0.7, 0.3], [0.01, 0.99]]),
            'light-on': np.array([[0.6, 0.4], [0.05, 0.95]])}
    states = {'bowel-problem': ['true', 'false'],
              'dog-out': ['true', 'false'],
              'family-out': ['true', 'false'],
              'hear-bark': ['true', 'false'],
              'light-on': ['true', 'false']}
    parents = {'bowel-problem': [],
               'dog-out': ['family-out', 'bowel-problem'],
               'family-out': [],
               'hear-bark': ['dog-out'],
               'light-on': ['family-out']}
    properties = {'bowel-problem': ['position = (335, 99)'],
                  'dog-out': ['position = (300, 195)'],
                  'family-out': ['position = (257, 99)'],
                  'hear-bark': ['position = (296, 268)'],
                  'light-on': ['position = (218, 195)']}

    self.model = BayesianModel(edges)

    tabular_cpds = []
    for var in sorted(cpds.keys()):
        values = cpds[var]
        cpd = TabularCPD(var, len(states[var]), values,
                         evidence=parents[var],
                         evidence_card=[len(states[evidence_var])
                                        for evidence_var in parents[var]])
        tabular_cpds.append(cpd)
    self.model.add_cpds(*tabular_cpds)

    # Use a distinct loop variable so the `properties` dict is not shadowed.
    for node, node_properties in properties.items():
        for prop in node_properties:
            prop_name, prop_value = map(lambda t: t.strip(), prop.split('='))
            self.model.node[node][prop_name] = prop_value

    self.writer = BIFWriter(model=self.model)
def set_distributions(self):
    """
    Set distributions in the network.

    Examples
    --------
    >>> from pgm.readwrite.XMLBeliefNetwork import XBNWriter
    >>> writer = XBNWriter()
    >>> writer.set_distributions()
    """
    distributions = etree.SubElement(self.bnmodel, 'DISTRIBUTIONS')
    cpds = self.model.get_cpds()
    cpds.sort(key=lambda x: x.variable)
    for cpd in cpds:
        cpd_values = cpd.values.ravel()
        var = cpd.variable
        dist = etree.SubElement(distributions, 'DIST',
                                attrib={'TYPE': self.model.node[var]['TYPE']})
        etree.SubElement(dist, 'PRIVATE', attrib={'NAME': var})
        dpis = etree.SubElement(dist, 'DPIS')
        evidence = cpd.variables[:0:-1]
        if evidence:
            condset = etree.SubElement(dist, 'CONDSET')
            for condelem in sorted(evidence):
                etree.SubElement(condset, 'CONDELEM', attrib={'NAME': condelem})
            # TODO: Get Index value.
            for val in range(0, len(cpd_values), 2):
                etree.SubElement(dpis, "DPI", attrib={'INDEXES': ' '}).text = \
                    " " + str(cpd_values[val]) + " " + str(cpd_values[val + 1]) + " "
        else:
            etree.SubElement(dpis, "DPI").text = ' ' + ' '.join(map(str, cpd_values))
def get_cpd(self):
    """
    Returns the CPD of the variables present in the network.

    Examples
    --------
    >>> reader = XMLBIF.XMLBIFReader("xmlbif_test.xml")
    >>> reader.get_cpd()
    {'bowel-problem': array([[ 0.01],
                             [ 0.99]]),
     'dog-out': array([[ 0.99,  0.01,  0.97,  0.03],
                       [ 0.9 ,  0.1 ,  0.3 ,  0.7 ]]),
     'family-out': array([[ 0.15],
                          [ 0.85]]),
     'hear-bark': array([[ 0.7 ,  0.3 ],
                         [ 0.01,  0.99]]),
     'light-on': array([[ 0.6 ,  0.4 ],
                        [ 0.05,  0.95]])}
    """
    variable_CPD = {definition.find('FOR').text: list(map(float, table.text.split()))
                    for definition in self.network.findall('DEFINITION')
                    for table in definition.findall('TABLE')}
    for variable in variable_CPD:
        arr = np.array(variable_CPD[variable])
        arr = arr.reshape((len(self.variable_states[variable]),
                           arr.size // len(self.variable_states[variable])),
                          order='F')
        variable_CPD[variable] = arr
    return variable_CPD
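# order='F' fills the result column by column: each run of len(states)
# consecutive entries in the flat TABLE text becomes one column of the CPD.
# A mechanical numpy sketch with made-up numbers for a two-state variable:
import numpy as np

flat = np.array([0.99, 0.01, 0.97, 0.03, 0.9, 0.1, 0.3, 0.7])
print(flat.reshape((2, 4), order='F'))
# [[0.99 0.97 0.9  0.3 ]
#  [0.01 0.03 0.1  0.7 ]]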
def _str(self, phi_or_p="phi", tablefmt="fancy_grid", print_state_names=True):
    """
    Generate the string from `__str__` method.

    Parameters
    ----------
    phi_or_p: 'phi' | 'p'
        'phi': When used for Factors.
        'p': When used for CPDs.

    tablefmt: str
        Table format name passed on to `tabulate`.

    print_state_names: boolean
        If True, the user defined state names are displayed.
    """
    string_header = list(map(lambda x: six.text_type(x), self.scope()))
    string_header.append('{phi_or_p}({variables})'.format(
        phi_or_p=phi_or_p, variables=','.join(string_header)))

    value_index = 0
    factor_table = []
    for prob in product(*[range(card) for card in self.cardinality]):
        if self.state_names and print_state_names:
            prob_list = ["{var}({state})".format(
                var=list(self.variables)[i],
                state=self.state_names[list(self.variables)[i]][prob[i]])
                for i in range(len(self.variables))]
        else:
            prob_list = ["{s}_{d}".format(s=list(self.variables)[i], d=prob[i])
                         for i in range(len(self.variables))]
        prob_list.append(self.values.ravel()[value_index])
        factor_table.append(prob_list)
        value_index += 1

    return tabulate(factor_table, headers=string_header,
                    tablefmt=tablefmt, floatfmt=".4f")
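# _str feeds one row per assignment (in product() order) to tabulate. A
# standalone sketch of the same table construction for a hypothetical binary
# factor, assuming the bundled tabulate module is importable as `tabulate`:
from itertools import product

from tabulate import tabulate

values = [0.25, 0.35, 0.08, 0.32]  # made-up phi values
rows = [["x1_{0}".format(a), "x2_{0}".format(b), v]
        for (a, b), v in zip(product(range(2), range(2)), values)]
print(tabulate(rows, headers=["x1", "x2", "phi(x1,x2)"],
               tablefmt="grid", floatfmt=".4f"))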
def sample_discrete(values, weights, size=1):
    """
    Generate a sample of given size, given a probability mass function.

    Parameters
    ----------
    values: numpy.array
        Array of all possible values that the random variable can take.
    weights: numpy.array or list of numpy.array
        Array(s) representing the PMF of the random variable.
    size: int
        Size of the sample to be generated.

    Returns
    -------
    numpy.array: of values of the random variable sampled from the given PMF.

    Example
    -------
    >>> import numpy as np
    >>> from pgm.utils.mathext import sample_discrete
    >>> values = np.array(['v_0', 'v_1', 'v_2'])
    >>> probabilities = np.array([0.2, 0.5, 0.3])
    >>> sample_discrete(values, probabilities, 10)
    array(['v_1', 'v_1', 'v_0', 'v_1', 'v_2', 'v_0', 'v_1', 'v_1', 'v_1',
           'v_2'], dtype='<U3')
    """
    weights = np.array(weights)
    if weights.ndim == 1:
        return np.random.choice(values, size=size, p=weights)
    else:
        # One PMF per sample: draw each value with its own row of weights.
        return np.fromiter(map(lambda t: np.random.choice(values, p=t), weights),
                           dtype='int')
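# The ndim > 1 branch draws each sample from its own row of weights, which is
# how the samplers pass evidence-dependent PMFs; integer values, since that
# branch builds an int array. A small sketch with made-up PMFs:
import numpy as np

values = np.array([0, 1, 2])
row_weights = np.array([[0.2, 0.5, 0.3],   # PMF for sample 0
                        [1.0, 0.0, 0.0],   # sample 1 is forced to 0
                        [0.0, 0.0, 1.0]])  # sample 2 is forced to 2
samples = np.fromiter(
    (np.random.choice(values, p=row) for row in row_weights), dtype='int')
print(samples)  # e.g. [1 0 2]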
def get_model(self):
    model = BayesianModel(self.get_edges())
    model.name = self.network_name

    tabular_cpds = []
    for var, values in self.variable_CPD.items():
        evidence_card = [len(self.variable_states[evidence_var])
                         for evidence_var in self.variable_parents[var]]
        cpd = TabularCPD(var, len(self.variable_states[var]), values,
                         evidence=self.variable_parents[var],
                         evidence_card=evidence_card,
                         state_names=self.get_states())
        tabular_cpds.append(cpd)
    model.add_cpds(*tabular_cpds)

    for node, properties in self.variable_property.items():
        for prop in properties:
            if prop is not None:
                prop_name, prop_value = map(lambda t: t.strip(), prop.split('='))
                model.node[node][prop_name] = prop_value

    return model
def get_model(self): """ Returns an instance of Bayesian Model or Markov Model. Varibles are in the pattern var_0, var_1, var_2 where var_0 is 0th index variable, var_1 is 1st index variable. Return ------ model: an instance of Bayesian or Markov Model. Examples -------- >>> reader = UAIReader('TestUAI.uai') >>> reader.get_model() """ if self.network_type == 'BAYES': model = BayesianModel(self.edges) tabular_cpds = [] for cpd in self.tables: child_var = cpd[0] states = int(self.domain[child_var]) arr = list(map(float, cpd[1])) values = np.array(arr) values = values.reshape(states, values.size // states) tabular_cpds.append(TabularCPD(child_var, states, values)) model.add_cpds(*tabular_cpds) return model elif self.network_type == 'MARKOV': model = MarkovModel(self.edges) factors = [] for table in self.tables: variables = table[0] cardinality = [int(self.domain[var]) for var in variables] value = list(map(float, table[1])) factor = DiscreteFactor(variables=variables, cardinality=cardinality, values=value) factors.append(factor) model.add_factors(*factors) return model
def setUp(self):
    self.maxDiff = None
    edges = [['family-out', 'dog-out'],
             ['bowel-problem', 'dog-out'],
             ['family-out', 'light-on'],
             ['dog-out', 'hear-bark']]
    cpds = {'bowel-problem': np.array([[0.01], [0.99]]),
            'dog-out': np.array([[0.99, 0.01, 0.97, 0.03],
                                 [0.9, 0.1, 0.3, 0.7]]),
            'family-out': np.array([[0.15], [0.85]]),
            'hear-bark': np.array([[0.7, 0.3], [0.01, 0.99]]),
            'light-on': np.array([[0.6, 0.4], [0.05, 0.95]])}
    states = {'bowel-problem': ['true', 'false'],
              'dog-out': ['true', 'false'],
              'family-out': ['true', 'false'],
              'hear-bark': ['true', 'false'],
              'light-on': ['true', 'false']}
    parents = {'bowel-problem': [],
               'dog-out': ['bowel-problem', 'family-out'],
               'family-out': [],
               'hear-bark': ['dog-out'],
               'light-on': ['family-out']}

    self.bayesmodel = BayesianModel(edges)

    tabular_cpds = []
    for var, values in cpds.items():
        cpd = TabularCPD(var, len(states[var]), values,
                         evidence=parents[var],
                         evidence_card=[len(states[evidence_var])
                                        for evidence_var in parents[var]])
        tabular_cpds.append(cpd)
    self.bayesmodel.add_cpds(*tabular_cpds)
    self.bayeswriter = UAIWriter(self.bayesmodel)

    edges = {('var_0', 'var_1'), ('var_0', 'var_2'), ('var_1', 'var_2')}
    self.markovmodel = MarkovModel(edges)
    tables = [(['var_0', 'var_1'],
               ['4.000', '2.400', '1.000', '0.000']),
              (['var_0', 'var_1', 'var_2'],
               ['2.2500', '3.2500', '3.7500', '0.0000', '0.0000', '10.0000',
                '1.8750', '4.0000', '3.3330', '2.0000', '2.0000', '3.4000'])]
    domain = {'var_1': '2', 'var_2': '3', 'var_0': '2'}
    factors = []
    for table in tables:
        variables = table[0]
        cardinality = [int(domain[var]) for var in variables]
        values = list(map(float, table[1]))
        factor = DiscreteFactor(variables, cardinality, values)
        factors.append(factor)
    self.markovmodel.add_factors(*factors)
    self.markovwriter = UAIWriter(self.markovmodel)
def _find_size_of_clique(clique, cardinalities):
    """
    Computes the size of each clique in the given list of cliques.

    Size of a clique is defined as the product of the cardinalities of all
    the nodes present in the clique.
    """
    return list(map(lambda x: np.prod([cardinalities[node] for node in x]),
                    clique))
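# Under made-up cardinalities, the helper's map reduces to one product per
# clique:
import numpy as np

cliques = [('a', 'b'), ('b', 'c', 'd')]
cardinalities = {'a': 2, 'b': 2, 'c': 3, 'd': 2}
print(list(map(lambda x: np.prod([cardinalities[node] for node in x]),
               cliques)))  # [4, 12]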
def get_model(self): """ Returns the model instance of the ProbModel. Return --------------- model: an instance of BayesianModel. Examples ------- >>> reader = ProbModelXMLReader() >>> reader.get_model() """ if self.probnet.get('type') == "BayesianNetwork": model = BayesianModel(self.probnet['edges'].keys()) tabular_cpds = [] cpds = self.probnet['Potentials'] for cpd in cpds: var = list(cpd['Variables'].keys())[0] states = self.probnet['Variables'][var]['States'] evidence = cpd['Variables'][var] evidence_card = [ len(self.probnet['Variables'][evidence_var]['States']) for evidence_var in evidence ] arr = list(map(float, cpd['Values'].split())) values = np.array(arr) values = values.reshape( (len(states), values.size // len(states))) tabular_cpds.append( TabularCPD(var, len(states), values, evidence, evidence_card)) model.add_cpds(*tabular_cpds) variables = model.nodes() for var in variables: for prop_name, prop_value in self.probnet['Variables'][ var].items(): model.node[var][prop_name] = prop_value edges = model.edges() for edge in edges: for prop_name, prop_value in self.probnet['edges'][edge].items( ): model.edge[edge[0]][edge[1]][prop_name] = prop_value return model else: raise ValueError("Please specify only Bayesian Network.")
def forward_sample(self, size=1):
    """
    Generates sample(s) from joint distribution of the bayesian network.

    Parameters
    ----------
    size: int
        size of sample to be generated

    Returns
    -------
    sampled: pandas.DataFrame
        the generated samples

    Examples
    --------
    >>> from pgm.models.BayesianModel import BayesianModel
    >>> from pgm.factors.discrete import TabularCPD
    >>> from pgm.sampling import BayesianModelSampling
    >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
    >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
    >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
    >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
    ...                0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
    ...                ['intel', 'diff'], [2, 2])
    >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
    >>> inference = BayesianModelSampling(student)
    >>> inference.forward_sample(2)
       diff  intel  grade
    0     1      0      1
    1     1      0      2
    """
    sampled = DataFrame(index=range(size), columns=self.topological_order)
    for node in self.topological_order:
        cpd = self.model.get_cpds(node)
        states = range(self.cardinality[node])
        evidence = cpd.variables[:0:-1]
        if evidence:
            cached_values = self.pre_compute_reduce(variable=node)
            evidence = sampled.ix[:, evidence].values
            weights = list(map(lambda t: cached_values[tuple(t)], evidence))
        else:
            weights = cpd.values
        sampled[node] = sample_discrete(states, weights, size)
    return sampled
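# forward_sample is plain ancestral sampling: visit nodes in topological
# order and draw each node from the CPD row selected by its already-sampled
# parents. A standalone sketch for a two-node chain with made-up CPDs:
import numpy as np

np.random.seed(0)
size = 5
p_a = np.array([0.6, 0.4])
p_b_given_a = np.array([[0.9, 0.1],   # row used when a = 0
                        [0.2, 0.8]])  # row used when a = 1

a = np.random.choice(2, size=size, p=p_a)
b = np.array([np.random.choice(2, p=p_b_given_a[ai]) for ai in a])
print(np.column_stack([a, b]))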
def get_model(self): """ Returns the fitted bayesian model Example ---------- >>> from pgm.readwrite import BIFReader >>> reader = BIFReader("bif_test.bif") >>> reader.get_model() <pgm.models.BayesianModel.BayesianModel object at 0x7f20af154320> """ try: model = BayesianModel(self.variable_edges) model.name = self.network_name model.add_nodes_from(self.variable_names) tabular_cpds = [] for var in sorted(self.variable_cpds.keys()): values = self.variable_cpds[var] cpd = TabularCPD( var, len(self.variable_states[var]), values, evidence=self.variable_parents[var], evidence_card=[ len(self.variable_states[evidence_var]) for evidence_var in self.variable_parents[var] ]) tabular_cpds.append(cpd) model.add_cpds(*tabular_cpds) for node, properties in self.variable_properties.items(): for prop in properties: prop_name, prop_value = map(lambda t: t.strip(), prop.split('=')) model.node[node][prop_name] = prop_value return model except AttributeError: raise AttributeError( 'First get states of variables, edges, parents and network name' )
def _align_column(strings, alignment, minwidth=0, has_invisible=True):
    """[string] -> [padded_string]

    >>> list(map(str,_align_column(["12.345", "-1234.5", "1.23", "1234.5", "1e+234", "1.0e234"], "decimal")))
    ['   12.345  ', '-1234.5    ', '    1.23   ', ' 1234.5    ', '    1e+234 ', '    1.0e234']

    >>> list(map(str,_align_column(['123.4', '56.7890'], None)))
    ['123.4', '56.7890']
    """
    if alignment == "right":
        strings = [s.strip() for s in strings]
        padfn = _padleft
    elif alignment == "center":
        strings = [s.strip() for s in strings]
        padfn = _padboth
    elif alignment == "decimal":
        if has_invisible:
            decimals = [_afterpoint(_strip_invisible(s)) for s in strings]
        else:
            decimals = [_afterpoint(s) for s in strings]
        maxdecimals = max(decimals)
        strings = [s + (maxdecimals - decs) * " "
                   for s, decs in zip(strings, decimals)]
        padfn = _padleft
    elif not alignment:
        return strings
    else:
        strings = [s.strip() for s in strings]
        padfn = _padright

    if has_invisible:
        width_fn = _visible_width
    else:
        width_fn = len

    maxwidth = max(max(map(width_fn, strings)), minwidth)
    padded_strings = [padfn(maxwidth, s, has_invisible) for s in strings]
    return padded_strings
def __str__(self):
    """
    Returns the BIF format as string.
    """
    (network_template, variable_template,
     property_template, probability_template) = self.BIF_templates()
    network = ''
    network += network_template.substitute(name=self.network_name)

    variables = self.model.nodes()
    for var in sorted(variables):
        no_of_states = str(len(self.variable_states[var]))
        states = ', '.join(self.variable_states[var])
        properties = ''
        for prop_val in self.property_tag[var]:
            properties += property_template.substitute(prop=prop_val)
        network += variable_template.substitute(name=var,
                                                no_of_states=no_of_states,
                                                states=states,
                                                properties=properties)

    for var in sorted(variables):
        if not self.variable_parents[var]:
            parents = ''
            seprator = ''
        else:
            parents = ', '.join(self.variable_parents[var])
            seprator = ' | '
        # 'seprator_' matches the placeholder name used in BIF_templates().
        cpd = ', '.join(map(str, self.tables[var]))
        network += probability_template.substitute(variable_=var,
                                                   seprator_=seprator,
                                                   parents=parents,
                                                   values=cpd)

    return network
def _latex_row(cell_values, colwidths, colaligns):
    def escape_char(c):
        return LATEX_ESCAPE_RULES.get(c, c)
    escaped_values = ["".join(map(escape_char, cell)) for cell in cell_values]
    rowfmt = DataRow("", "&", "\\\\")
    return _build_simple_row(escaped_values, rowfmt)
def to_junction_tree(self):
    """
    Creates a junction tree (or clique tree) for a given markov model.

    For a given markov model (H) a junction tree (G) is a graph
    1. where each node in G corresponds to a maximal clique in H
    2. each sepset in G separates the variables strictly on one side of the
       edge to other.

    Examples
    --------
    >>> from pgm.models import MarkovModel
    >>> from pgm.factors.discrete import DiscreteFactor
    >>> mm = MarkovModel()
    >>> mm.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'])
    >>> mm.add_edges_from([('x1', 'x3'), ('x1', 'x4'), ('x2', 'x4'),
    ...                    ('x2', 'x5'), ('x3', 'x6'), ('x4', 'x6'),
    ...                    ('x4', 'x7'), ('x5', 'x7')])
    >>> phi = [DiscreteFactor(edge, [2, 2], np.random.rand(4)) for edge in mm.edges()]
    >>> mm.add_factors(*phi)
    >>> junction_tree = mm.to_junction_tree()
    """
    from pgm.models import JunctionTree

    # Check whether the model is valid or not
    self.check_model()

    # Triangulate the graph to make it chordal
    triangulated_graph = self.triangulate()

    # Find maximal cliques in the chordal graph
    cliques = list(map(tuple, nx.find_cliques(triangulated_graph)))

    # If there is only 1 clique, then the junction tree formed is just a
    # clique tree with that single clique as the node
    if len(cliques) == 1:
        clique_trees = JunctionTree()
        clique_trees.add_node(cliques[0])

    # Else if the number of cliques is more than 1 then create a complete
    # graph with all the cliques as nodes and weight of the edges being
    # the length of sepset between two cliques
    elif len(cliques) >= 2:
        complete_graph = UndirectedGraph()
        edges = list(itertools.combinations(cliques, 2))
        weights = list(map(lambda x: len(set(x[0]).intersection(set(x[1]))),
                           edges))
        for edge, weight in zip(edges, weights):
            complete_graph.add_edge(*edge, weight=-weight)

        # Create clique trees by minimum (or maximum) spanning tree method
        clique_trees = JunctionTree(
            nx.minimum_spanning_tree(complete_graph).edges())

    # Check whether the factors are defined for all the random variables or not
    all_vars = itertools.chain(*[factor.scope() for factor in self.factors])
    if set(all_vars) != set(self.nodes()):
        raise ValueError('DiscreteFactor for all the random variables not specified')

    # Dictionary stating whether the factor is used to create clique
    # potential or not
    # If false, then it is not used to create any clique potential
    is_used = {factor: False for factor in self.factors}

    for node in clique_trees.nodes():
        clique_factors = []
        for factor in self.factors:
            # If the factor is not used in creating any clique potential as
            # well as has any variable of the given clique in its scope,
            # then use it in creating clique potential
            if not is_used[factor] and set(factor.scope()).issubset(node):
                clique_factors.append(factor)
                is_used[factor] = True

        # To compute clique potential, initially set it as unity factor
        var_card = [self.get_cardinality()[x] for x in node]
        clique_potential = DiscreteFactor(node, var_card,
                                          np.ones(np.product(var_card)))
        # multiply it with the factors associated with the variables present
        # in the clique (or node)
        clique_potential *= factor_product(*clique_factors)
        clique_trees.add_factors(clique_potential)

    if not all(is_used.values()):
        raise ValueError('All the factors were not used to create Junction '
                         'Tree. Extra factors are defined.')

    return clique_trees
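# The elif branch above negates sepset sizes so that a minimum spanning tree
# maximizes the total sepset size. That step in isolation, with networkx and
# made-up cliques:
import itertools

import networkx as nx

cliques = [('x1', 'x3', 'x4'), ('x2', 'x4', 'x5'),
           ('x3', 'x4', 'x6'), ('x4', 'x5', 'x7')]
g = nx.Graph()
for c1, c2 in itertools.combinations(cliques, 2):
    g.add_edge(c1, c2, weight=-len(set(c1) & set(c2)))
print(sorted(nx.minimum_spanning_tree(g).edges()))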
def likelihood_weighted_sample(self, evidence=None, size=1):
    """
    Generates weighted sample(s) from joint distribution of the bayesian
    network, that comply with the given evidence.
    'Probabilistic Graphical Model Principles and Techniques', Koller and
    Friedman, Algorithm 12.2 pp 493.

    Parameters
    ----------
    evidence: list of `pgm.factor.State` namedtuples
        None if no evidence
    size: int
        size of sample to be generated

    Returns
    -------
    sampled: pandas.DataFrame
        the generated samples with corresponding weights

    Examples
    --------
    >>> from pgm.factors.discrete import State
    >>> from pgm.models.BayesianModel import BayesianModel
    >>> from pgm.factors.discrete import TabularCPD
    >>> from pgm.sampling import BayesianModelSampling
    >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
    >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
    >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
    >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
    ...                0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
    ...                ['intel', 'diff'], [2, 2])
    >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
    >>> inference = BayesianModelSampling(student)
    >>> evidence = [State('diff', 0)]
    >>> inference.likelihood_weighted_sample(evidence, 2)
       intel  diff  grade  _weight
    0      0     0      1      0.6
    1      1     0      1      0.6
    """
    sampled = DataFrame(index=range(size), columns=self.topological_order)
    sampled['_weight'] = np.ones(size)
    evidence_dict = {var: st for var, st in evidence}

    for node in self.topological_order:
        cpd = self.model.get_cpds(node)
        states = range(self.cardinality[node])
        evidence = cpd.get_evidence()

        if evidence:
            evidence_values = sampled.ix[:, evidence].values
            cached_values = self.pre_compute_reduce(node)
            weights = list(map(lambda t: cached_values[tuple(t)],
                               evidence_values))
            if node in evidence_dict:
                sampled[node] = evidence_dict[node]
                for i in range(size):
                    sampled.loc[i, '_weight'] *= weights[i][evidence_dict[node]]
            else:
                sampled[node] = sample_discrete(states, weights)
        else:
            if node in evidence_dict:
                sampled[node] = evidence_dict[node]
                for i in range(size):
                    sampled.loc[i, '_weight'] *= cpd.values[evidence_dict[node]]
            else:
                sampled[node] = sample_discrete(states, cpd.values, size)

    return sampled
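# In likelihood weighting, evidence nodes are clamped rather than sampled,
# and each one multiplies the sample weight by P(evidence value | sampled
# parents). That update in isolation, with made-up numbers:
import numpy as np

np.random.seed(0)
size = 4
p_diff = np.array([0.6, 0.4])
p_grade_given_diff = np.array([[0.7, 0.3],   # P(grade | diff=0)
                               [0.1, 0.9]])  # P(grade | diff=1)
evidence_grade = 1  # observed state of the evidence node

diff = np.random.choice(2, size=size, p=p_diff)
weights = np.ones(size) * p_grade_given_diff[diff, evidence_grade]
print(np.column_stack([diff, weights]))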
def _normalize_tabular_data(tabular_data, headers):
    """Transform a supported data type to a list of lists, and a list of headers.

    Supported tabular data types:

    * list-of-lists or another iterable of iterables
    * list of named tuples (usually used with headers="keys")
    * list of dicts (usually used with headers="keys")
    * list of OrderedDicts (usually used with headers="keys")
    * 2D NumPy arrays
    * NumPy record arrays (usually used with headers="keys")
    * dict of iterables (usually used with headers="keys")
    * pandas.DataFrame (usually used with headers="keys")

    The first row can be used as headers if headers="firstrow",
    column indices can be used as headers if headers="keys".
    """
    if hasattr(tabular_data, "keys") and hasattr(tabular_data, "values"):
        # dict-like and pandas.DataFrame?
        if hasattr(tabular_data.values, "__call__"):
            # likely a conventional dict
            keys = tabular_data.keys()
            # columns have to be transposed
            rows = list(izip_longest(*tabular_data.values()))
        elif hasattr(tabular_data, "index"):
            # values is a property, has .index => it's likely
            # a pandas.DataFrame (pandas 0.11.0)
            keys = tabular_data.keys()
            vals = tabular_data.values  # values matrix doesn't need to be transposed
            names = tabular_data.index
            rows = [[v] + list(row) for v, row in zip(names, vals)]
        else:
            raise ValueError("tabular data doesn't appear to be a dict or a DataFrame")

        if headers == "keys":
            headers = list(map(_text_type, keys))  # headers should be strings

    else:
        # it's a usual iterable of iterables, or a NumPy array
        rows = list(tabular_data)

        if (headers == "keys" and
                hasattr(tabular_data, "dtype") and
                getattr(tabular_data.dtype, "names")):
            # numpy record array
            headers = tabular_data.dtype.names
        elif (headers == "keys" and len(rows) > 0 and
              isinstance(rows[0], tuple) and
              hasattr(rows[0], "_fields")):
            # namedtuple
            headers = list(map(_text_type, rows[0]._fields))
        elif len(rows) > 0 and isinstance(rows[0], dict):
            # dict or OrderedDict
            uniq_keys = set()  # implements hashed lookup
            keys = []  # storage for set
            if headers == "firstrow":
                firstdict = rows[0] if len(rows) > 0 else {}
                keys.extend(firstdict.keys())
                uniq_keys.update(keys)
                rows = rows[1:]
            for row in rows:
                for k in row.keys():
                    # Save unique items in input order
                    if k not in uniq_keys:
                        keys.append(k)
                        uniq_keys.add(k)
            if headers == 'keys':
                headers = keys
            elif isinstance(headers, dict):
                # a dict of headers for a list of dicts
                headers = [headers.get(k, k) for k in keys]
                headers = list(map(_text_type, headers))
            elif headers == "firstrow":
                if len(rows) > 0:
                    headers = [firstdict.get(k, k) for k in keys]
                    headers = list(map(_text_type, headers))
                else:
                    headers = []
            elif headers:
                raise ValueError('headers for a list of dicts is not a dict or a keyword')
            rows = [[row.get(k) for k in keys] for row in rows]
        elif headers == "keys" and len(rows) > 0:
            # keys are column indices
            headers = list(map(_text_type, range(len(rows[0]))))

    # take headers from the first row if necessary
    if headers == "firstrow" and len(rows) > 0:
        headers = list(map(_text_type, rows[0]))  # headers should be strings
        rows = rows[1:]

    headers = list(map(_text_type, headers))
    rows = list(map(list, rows))

    # pad with empty headers for initial columns if necessary
    if headers and len(rows) > 0:
        nhs = len(headers)
        ncols = len(rows[0])
        if nhs < ncols:
            headers = [""] * (ncols - nhs) + headers

    return rows, headers
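# As a sanity check that different input shapes normalize identically, the
# same data as a list of rows and as a dict of columns should render the same
# table (assuming this module is importable as `tabulate`):
from collections import OrderedDict

from tabulate import tabulate

rows = [["spam", 41.9999], ["eggs", 451.0]]
cols = OrderedDict([("strings", ["spam", "eggs"]),
                    ("numbers", [41.9999, 451.0])])
assert tabulate(rows, headers=["strings", "numbers"]) == \
    tabulate(cols, headers="keys")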
def tabulate(tabular_data, headers=[], tablefmt="simple",
             floatfmt="g", numalign="decimal", stralign="left",
             missingval=""):
    """Format a fixed width table for pretty printing.

    >>> print(tabulate([[1, 2.34], [-56, "8.999"], ["2", "10001"]]))
    ---  ---------
      1      2.34
    -56      8.999
      2  10001
    ---  ---------

    The first required argument (`tabular_data`) can be a
    list-of-lists (or another iterable of iterables), a list of named
    tuples, a dictionary of iterables, an iterable of dictionaries,
    a two-dimensional NumPy array, NumPy record array, or a Pandas'
    dataframe.

    Table headers
    -------------

    To print nice column headers, supply the second argument (`headers`):

      - `headers` can be an explicit list of column headers
      - if `headers="firstrow"`, then the first row of data is used
      - if `headers="keys"`, then dictionary keys or column indices are used

    Otherwise a headerless table is produced.

    If the number of headers is less than the number of columns, they
    are supposed to be names of the last columns. This is consistent
    with the plain-text format of R and Pandas' dataframes.

    >>> print(tabulate([["sex","age"],["Alice","F",24],["Bob","M",19]],
    ...       headers="firstrow"))
           sex      age
    -----  -----  -----
    Alice  F         24
    Bob    M         19

    Column alignment
    ----------------

    `tabulate` tries to detect column types automatically, and aligns
    the values properly. By default it aligns decimal points of the
    numbers (or flushes integer numbers to the right), and flushes
    everything else to the left. Possible column alignments
    (`numalign`, `stralign`) are: "right", "center", "left", "decimal"
    (only for `numalign`), and None (to disable alignment).

    Table formats
    -------------

    `floatfmt` is a format specification used for columns which
    contain numeric data with a decimal point.

    `None` values are replaced with a `missingval` string:

    >>> print(tabulate([["spam", 1, None],
    ...                 ["eggs", 42, 3.14],
    ...                 ["other", None, 2.7]], missingval="?"))
    -----  --  ----
    spam    1  ?
    eggs   42  3.14
    other   ?  2.7
    -----  --  ----

    Various plain-text table formats (`tablefmt`) are supported:
    'plain', 'simple', 'grid', 'pipe', 'orgtbl', 'rst', 'mediawiki',
    'latex', and 'latex_booktabs'. Variable `tabulate_formats` contains
    the list of currently supported formats.

    "plain" format doesn't use any pseudographics to draw tables,
    it separates columns with a double space:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]],
    ...                ["strings", "numbers"], "plain"))
    strings      numbers
    spam         41.9999
    eggs        451

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="plain"))
    spam   41.9999
    eggs  451

    "simple" format is like Pandoc simple_tables:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]],
    ...                ["strings", "numbers"], "simple"))
    strings      numbers
    ---------  ---------
    spam         41.9999
    eggs        451

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="simple"))
    ----  --------
    spam   41.9999
    eggs  451
    ----  --------

    "grid" is similar to tables produced by Emacs table.el package or
    Pandoc grid_tables:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]],
    ...                ["strings", "numbers"], "grid"))
    +-----------+-----------+
    | strings   |   numbers |
    +===========+===========+
    | spam      |   41.9999 |
    +-----------+-----------+
    | eggs      |  451      |
    +-----------+-----------+

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="grid"))
    +------+----------+
    | spam |  41.9999 |
    +------+----------+
    | eggs | 451      |
    +------+----------+

    "fancy_grid" draws a grid using box-drawing characters:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]],
    ...                ["strings", "numbers"], "fancy_grid"))
    ╒═══════════╤═══════════╕
    │ strings   │   numbers │
    ╞═══════════╪═══════════╡
    │ spam      │   41.9999 │
    ├───────────┼───────────┤
    │ eggs      │  451      │
    ╘═══════════╧═══════════╛

    "pipe" is like tables in PHP Markdown Extra extension or Pandoc
    pipe_tables:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]],
    ...                ["strings", "numbers"], "pipe"))
    | strings   |   numbers |
    |:----------|----------:|
    | spam      |   41.9999 |
    | eggs      |  451      |

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="pipe"))
    |:-----|---------:|
    | spam |  41.9999 |
    | eggs | 451      |

    "orgtbl" is like tables in Emacs org-mode and orgtbl-mode. They
    are slightly different from "pipe" format by not using colons to
    define column alignment, and using a "+" sign to indicate line
    intersections:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]],
    ...                ["strings", "numbers"], "orgtbl"))
    | strings   |   numbers |
    |-----------+-----------|
    | spam      |   41.9999 |
    | eggs      |  451      |

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="orgtbl"))
    | spam |  41.9999 |
    | eggs | 451      |

    "rst" is like a simple table format from reStructuredText; please
    note that reStructuredText accepts also "grid" tables:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]],
    ...                ["strings", "numbers"], "rst"))
    =========  =========
    strings      numbers
    =========  =========
    spam         41.9999
    eggs        451
    =========  =========

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="rst"))
    ====  ========
    spam   41.9999
    eggs  451
    ====  ========

    "mediawiki" produces a table markup used in Wikipedia and on other
    MediaWiki-based sites:

    >>> print(tabulate([["strings", "numbers"], ["spam", 41.9999], ["eggs", "451.0"]],
    ...                headers="firstrow", tablefmt="mediawiki"))
    {| class="wikitable" style="text-align: left;"
    |+ <!-- caption -->
    |-
    ! strings   !! align="right"|   numbers
    |-
    | spam      || align="right"|   41.9999
    |-
    | eggs      || align="right"|  451
    |}

    "html" produces HTML markup:

    >>> print(tabulate([["strings", "numbers"], ["spam", 41.9999], ["eggs", "451.0"]],
    ...                headers="firstrow", tablefmt="html"))
    <table>
    <tr><th>strings  </th><th style="text-align: right;">  numbers</th></tr>
    <tr><td>spam     </td><td style="text-align: right;">  41.9999</td></tr>
    <tr><td>eggs     </td><td style="text-align: right;"> 451     </td></tr>
    </table>

    "latex" produces a tabular environment of LaTeX document markup:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="latex"))
    \\begin{tabular}{lr}
    \\hline
     spam &  41.9999 \\\\
     eggs & 451      \\\\
    \\hline
    \\end{tabular}

    "latex_booktabs" produces a tabular environment of LaTeX document
    markup using the booktabs.sty package:

    >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="latex_booktabs"))
    \\begin{tabular}{lr}
    \\toprule
     spam &  41.9999 \\\\
     eggs & 451      \\\\
    \\bottomrule
    \\end{tabular}
    """
    if tabular_data is None:
        tabular_data = []
    list_of_lists, headers = _normalize_tabular_data(tabular_data, headers)

    # optimization: look for ANSI control codes once,
    # enable smart width functions only if a control code is found
    plain_text = '\n'.join(['\t'.join(map(_text_type, headers))] +
                           ['\t'.join(map(_text_type, row))
                            for row in list_of_lists])
    has_invisible = re.search(_invisible_codes, plain_text)
    if has_invisible:
        width_fn = _visible_width
    else:
        width_fn = len

    # format rows and columns, convert numeric values to strings
    cols = list(zip(*list_of_lists))
    coltypes = list(map(_column_type, cols))
    cols = [[_format(v, ct, floatfmt, missingval, has_invisible) for v in c]
            for c, ct in zip(cols, coltypes)]

    # align columns
    aligns = [numalign if ct in [int, float] else stralign for ct in coltypes]
    minwidths = ([width_fn(h) + MIN_PADDING for h in headers]
                 if headers else [0] * len(cols))
    cols = [_align_column(c, a, minw, has_invisible)
            for c, a, minw in zip(cols, aligns, minwidths)]

    if headers:
        # align headers and add headers
        t_cols = cols or [['']] * len(headers)
        t_aligns = aligns or [stralign] * len(headers)
        minwidths = [max(minw, width_fn(c[0]))
                     for minw, c in zip(minwidths, t_cols)]
        headers = [_align_header(h, a, minw)
                   for h, a, minw in zip(headers, t_aligns, minwidths)]
        rows = list(zip(*cols))
    else:
        minwidths = [width_fn(c[0]) for c in cols]
        rows = list(zip(*cols))

    if not isinstance(tablefmt, TableFormat):
        tablefmt = _table_formats.get(tablefmt, _table_formats["simple"])

    return _format_table(tablefmt, headers, rows, minwidths, aligns)