class SIFTER(object):
    """Ties a phylogeny-backed ``BioBayesGraph`` to pluggable evidence processors.

    Typical use, as far as this class shows: load a phylogeny, register one
    evidence processor per evidence type, configure every graph node with
    ``setup_nodes``, then parse and process evidence through the registered
    processors.
    """

    def __init__(self):
        """Create an empty phylogeny graph and an empty processor registry."""
        # Graph the phylogeny is loaded into and whose nodes get configured.
        self.phylo_graph = BioBayesGraph()
        # Maps an evidence type key to its instantiated evidence processor.
        self.evidence_processors = {}

    def load_phylogeny(self, phylo_file, phylo_format='phyloxml'):
        """Populate the graph from a phylogeny file.

        Args:
            phylo_file: Passed through unchanged to the graph's loader.
            phylo_format: Either ``'phyloxml'`` (default) or ``'newick'``.

        Raises:
            Exception: If ``phylo_format`` is neither supported value.
        """
        if phylo_format == 'phyloxml':
            self.phylo_graph.populate_from_phyloxml(phylo_file)
        elif phylo_format == 'newick':
            self.phylo_graph.populate_from_newick(phylo_file)
        else:
            # Call form of ``raise`` so the module parses on Python 2 and 3.
            raise Exception("Phylo format requested isn't supported.")

    def load_evidence_processor(self, evidence_type, evidence_processor_class,
                                processor_settings):
        """Instantiate a processor and register it under ``evidence_type``.

        Args:
            evidence_type: Key used later by ``parse_evidence`` and
                ``process_evidence``. Re-using a key replaces the processor.
            evidence_processor_class: Callable invoked with
                ``processor_settings`` as its only argument.
            processor_settings: Passed to ``evidence_processor_class``.
        """
        self.evidence_processors[evidence_type] = \
            evidence_processor_class(processor_settings)

    def _get_processor(self, evidence_type):
        """Return the processor registered for ``evidence_type``.

        Raises:
            Exception: If no processor was registered for ``evidence_type``.
        """
        if evidence_type not in self.evidence_processors:
            raise Exception("Evidence type requested doesn't have a handler.")
        return self.evidence_processors[evidence_type]

    def parse_evidence(self, evidence_type, evidence_file,
                       evidence_constraints, evidence_format):
        """Parse an evidence file with the processor for ``evidence_type``.

        Args:
            evidence_type: Key of a previously registered processor.
            evidence_file: Forwarded to the processor's ``parse_evidence``.
            evidence_constraints: Forwarded to the processor.
            evidence_format: Forwarded to the processor.

        Returns:
            Whatever the processor's ``parse_evidence`` returns.

        Raises:
            Exception: If no processor is registered for ``evidence_type``.
        """
        return self._get_processor(evidence_type).parse_evidence(
            evidence_file=evidence_file,
            evidence_format=evidence_format,
            evidence_constraints=evidence_constraints)

    def setup_nodes(self, node_to_fcn_model_map):
        """Configure every vertex of the phylogeny graph.

        ``node_to_fcn_model_map`` is called with each vertex index and must
        return a dictionary of this shape::

            {
                'auxiliary_info': {'num_functions': num_fcns,
                                   'max_num_simul': 3},
                'prob_dist_class': prob_dist_class,
            }

        ``prob_dist_class`` must be the class object itself, not a dotted
        name: it is instantiated here and its ``__name__`` is what gets
        registered on the node. For example::

            node_to_fcn_model_map = lambda v: {
                'auxiliary_info': {'num_functions': num_fcns,
                                   'max_num_simul': 3},
                'prob_dist_class': FunctionModel,
            }

        Args:
            node_to_fcn_model_map: Callable mapping a vertex index (int) to
                the dictionary described above.
        """
        # One helper instance per distribution class name. It is built with
        # four ``None`` arguments and used only to enumerate protein states.
        dist_instances = {}

        for vertex in self.phylo_graph.g.vertices():
            node_index = int(vertex)
            fcn_model_info = node_to_fcn_model_map(node_index)
            auxiliary_info = fcn_model_info['auxiliary_info']

            # Store auxiliary info by node.
            self.phylo_graph.set_node_auxiliary_information(
                node_index=node_index, auxiliary_info=auxiliary_info)

            # Register each distribution class with the graph only once.
            dist_class = fcn_model_info['prob_dist_class']
            dist_name = dist_class.__name__
            if dist_name not in dist_instances:
                self.phylo_graph.add_prob_dist(prob_dist_class=dist_class)
                dist_instances[dist_name] = dist_class(None, None, None, None)
            dist_inst = dist_instances[dist_name]

            # Every node carries exactly one variable.
            self.phylo_graph.set_node_variable_count(
                node_index=node_index, num_vars=1)

            # The single variable's domain is the list of protein states the
            # distribution class enumerates for this node's settings.
            protein_states = list(dist_inst.possible_protein_states(
                fcn_variants_cnt=auxiliary_info['num_functions'],
                max_fcn_cnt=auxiliary_info['max_num_simul']))
            self.phylo_graph.set_node_variable_domains(
                node_index=node_index, var_domains=[protein_states])

            # The graph refers to the distribution by its class name.
            self.phylo_graph.set_node_probability_dist(
                node_index=node_index, prob_dist_class=dist_name)

    def process_evidence(self, evidence_type, evidence_set,
                         evidence_constraints):
        """Hand an evidence set to the processor for ``evidence_type``.

        Args:
            evidence_type: Key of a previously registered processor.
            evidence_set: Forwarded to the processor's ``process_evidence``.
            evidence_constraints: Forwarded to the processor.

        Returns:
            Whatever the processor's ``process_evidence`` returns.

        Raises:
            Exception: If no processor is registered for ``evidence_type``.
        """
        return self._get_processor(evidence_type).process_evidence(
            evidence_set=evidence_set,
            evidence_constraints=evidence_constraints)
class Test_inference(unittest.TestCase):
    """Tests inference queries on a graphical model built from a phylogeny."""

    def setUp(self):
        """Load the example phylogeny and give every node the same toy model.

        Each node gets two variables (domains ``{0, 1, 2}`` and ``{0, 1}``)
        and the constant-valued ``ProbDist1`` distribution defined below.
        """
        phylo_file = os.path.dirname(os.path.realpath(__file__)) + "/example_data/Asp_protease_2.xml"
        self.phylo_graph = BioBayesGraph()
        self.graph = self.phylo_graph.populate_from_phyloxml(phylo_file)

        # Incorporates the code for the ProbDist1 class into the graph.
        class ProbDist1(object):
            """Minimal example distribution: every likelihood/probability is 1."""

            def __init__(self, graph, node, node_to_name_map):
                # graph, node are respectively:
                # http://projects.skewed.de/graph-tool/doc/graph_tool.html#graph_tool.Graph
                # http://projects.skewed.de/graph-tool/doc/graph_tool.html#graph_tool.Vertex
                # node_to_name_map is a python dictionary in which
                # any named node's index (can get by int(node_of_interest))
                # will map to the phylogenetic name associated. (If exists)
                self.graph = graph
                self.node = node
                # NOTE(review): the attribute name reads inverted relative to
                # the argument it stores; kept as-is in case it is read
                # elsewhere -- confirm before renaming.
                self.name_to_node_map = node_to_name_map

            def compute_virtual_likelihood(self, vals, auxiliary_info):
                # "vals" is vector of the particular values this node
                # is taking.
                #
                # "auxiliary_info" is the custom information provided
                # when the virtual evidence was specified.
                return 1

            def compute_pd(self, vals):
                # Returns the conditional probability for this node at vals.
                # Get parent node(s); collected only to illustrate how a real
                # distribution would look them up -- the result is unused.
                parents = []
                for p_node in self.node.in_neighbours():
                    parents.append(int(p_node))
                # Note that you shape this depending on node location and
                # other properties in the graph.
                # Also, you can store computations into class-wide variables
                # (e.g. ClassName.var_to_store) to cache computations. You
                # could also declare the variable being stored to as global.
                return 1

        self.phylo_graph.add_prob_dist(prob_dist_class=ProbDist1)

        # Sets all nodes to have two variables:
        # the first with 3 values, the second with two values.
        for node in self.graph.vertices():
            node_index = int(node)
            # Each node has v1, v2
            self.phylo_graph.set_node_variable_count(node_index=node_index, num_vars=2)
            # v1 \in {0,1,2}, v2 \in {0,1}
            self.phylo_graph.set_node_variable_domains(node_index=node_index, var_domains=[(0, 1, 2), (0, 1)])
            # Use the same probability dist (defined in the class above)
            self.phylo_graph.set_node_probability_dist(node_index=node_index, prob_dist_class="ProbDist1")

    def _add_example_evidence(self):
        """Reset evidence, then add one "hard" and one "virtual" observation."""
        self.phylo_graph.clear_all_evidence()
        self.phylo_graph.add_hard_evidence(
            node_index=self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"),
            observed_value=(0, 1),  # v1 = 0, v2 = 1
        )
        self.phylo_graph.add_virtual_evidence(
            node_index=self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"),
            observed_value=(2, 0),  # v1 = 2, v2 = 0
            auxiliary_info={"custom_info"},  # info provided to likelihood function
        )

    def _query_and_check_marginals(self):
        """Query three nodes and check one known marginal for each of them."""
        query_nodes = [
            self.phylo_graph.get_node_by_name("C7PIL1_CHIPD/40-136"),  # Some other node
            self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"),  # Virtual observation
            self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"),  # Hard observation
        ]
        q_results = self.phylo_graph.inference_query(query_nodes=query_nodes)

        # node -> (variable assignment, expected marginal of that assignment)
        expected = {
            self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"): ((0, 0), 0.166666666667),
            self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"): ((0, 1), 1.0),
            self.phylo_graph.get_node_by_name("C7PIL1_CHIPD/40-136"): ((0, 1), 0.166666666667),
        }

        # Single-argument print calls emit the same text on Python 2 and 3.
        for qn, marginals in q_results.items():
            print("For node %s" % (self.phylo_graph.get_name_by_node(qn),))
            for var_val, marg_val in marginals:
                print("%s : %s" % (var_val, marg_val))
                # Only the one assignment listed in ``expected`` is checked.
                if var_val == expected[qn][0]:
                    self.assertAlmostEqual(marg_val, expected[qn][1])

    def _print_leave_one_out(self):
        """Run leave-one-out inference and print the per-node results."""
        q_results = self.phylo_graph.inference_query_leave_one_out()
        for qn, left_out_results in q_results.items():
            print("For node %s" % (self.phylo_graph.get_name_by_node(qn),))
            pprint(left_out_results)
            print("------\n")

    def testInference(self):
        """Runs a query using libdai."""
        self._add_example_evidence()
        # An observation could be dropped again at this point with
        # phylo_graph.remove_evidence_at_node(node_index=...).
        self.phylo_graph.create_inference_representation()
        self._query_and_check_marginals()

    def testLeaveOneOut(self):
        """Tests leave-one-out inference looping.

        The leave-one-out results are only printed; no expected values are
        asserted for them. The regular query in between is checked against
        the same marginals as ``testInference`` to confirm that a
        leave-one-out pass leaves ordinary inference intact.
        """
        self._add_example_evidence()
        self._print_leave_one_out()
        self._query_and_check_marginals()
        print("------\n")
        self._print_leave_one_out()
class Test_nodedistributionsetup(unittest.TestCase):
    """Checks variable counts and domains assigned to phylogeny graph nodes."""

    def setUp(self):
        """Build a fresh graph from the example phylogeny file."""
        here = os.path.dirname(os.path.realpath(__file__))
        self.phylo_graph = BioBayesGraph()
        self.graph = self.phylo_graph.populate_from_phyloxml(here + "/example_data/Asp_protease_2.xml")

    def _domain_size_total(self, node_index):
        """Return the summed domain sizes over all variables of one node."""
        var_count = self.phylo_graph.get_node_variable_count(node_index)
        return sum(
            len(self.phylo_graph.get_node_variable_domain(node_index, v_ind))
            for v_ind in range(var_count)
        )

    def testSetLeafNodeVariableInitialization(self):
        """Give each leaf three variables and check the total domain size."""
        # var1 takes values in {0, 1}, var2 in {0, 1, 2, 3},
        # and var3 in {'a', 'b', 'c'}.
        leaf_domains = [(0, 1), (0, 1, 2, 3), ("a", "b", "c")]
        total = 0.0
        for leaf in self.phylo_graph.iterleafnodes():
            self.phylo_graph.set_node_variable_count(node_index=leaf, num_vars=3)
            self.phylo_graph.set_node_variable_domains(node_index=leaf, var_domains=leaf_domains)
            total += self._domain_size_total(leaf)
        # 305 leaves * (2 + 4 + 3) domain values each
        self.assertEqual(total, 2745.0, "Leaf node variable cardinality sum check didn't pass")

    def testSetInternalNodeVariableInitialization(self):
        """Give each internal node three binary variables and check the total."""
        total = 0.0
        for inner in self.phylo_graph.iterinternalnodes():
            self.phylo_graph.set_node_variable_count(node_index=inner, num_vars=3)
            # All three variables take values in {0, 1}.
            self.phylo_graph.set_node_variable_domains(node_index=inner, var_domains=[(0, 1), (0, 1), (0, 1)])
            total += self._domain_size_total(inner)
        # 303 internal nodes * (2 + 2 + 2) domain values each
        self.assertEqual(total, 1818.0, "Internal node variable cardinality sum check didn't pass")

    def testProbabilityDistInitialization(self):
        """Re-apply the leaf variable count and re-check the domain total.

        Runs the leaf initialization test first, then sets the variable count
        to 3 again on every leaf and verifies that the domain sizes still add
        up to the same value. No probability distribution is assigned here.
        """
        self.testSetLeafNodeVariableInitialization()
        total = 0.0
        for leaf in self.phylo_graph.iterleafnodes():
            self.phylo_graph.set_node_variable_count(node_index=leaf, num_vars=3)
            total += self._domain_size_total(leaf)
        # 305 leaves * (2 + 4 + 3) domain values each
        self.assertEqual(total, 2745.0, "Leaf node variable cardinality sum check didn't pass")