def testPhylogenyFromNewick(self):
    """
    Tries to load a newick tree from file.
    """
    phylo_file = os.path.dirname(os.path.realpath(__file__)) + '/example_data/Asp_protease_2.nhx'
    phylo_graph = BioBayesGraph()
    graph = phylo_graph.populate_from_newick(phylo_file)

    self.assertTrue(("name" in graph.vertex_properties),
                    msg="Clade names not imported correctly.")
    self.assertTrue(("branch_length" in graph.edge_properties),
                    msg="Branch lengths not imported correctly.")

    bl_sum = 0.0
    for e in graph.edges():
        bl_sum += float(graph.edge_properties['branch_length'][e])
    self.assertTrue(expr=abs(bl_sum - 168.58699) < 1e-6,
                    msg="Branch lengths not imported correctly (sum is wrong)")

    self.assertEqual(graph.num_vertices(), 608,
                     "Didn't get expected number of nodes from phylogeny.")
    self.assertEqual(graph.num_edges(), 607,
                     "Didn't get expected number of edges from phylogeny.")
def testGraphLoad(self):
    """
    Loads a previously exported graph from a GraphML (.bbg) file.
    """
    obo_graph = BioBayesGraph()
    obo_graph.import_from_graphml(os.path.dirname(os.path.realpath(__file__))
                                  + "/example_data/MF.bbg")
    for k in self.node_prop_types.iterkeys():
        self.assertTrue((k in obo_graph.g.vertex_properties),
                        msg="%s not imported correctly." % k)
def testExportFormats(self):
    """
    Tries to create a graph from a phylogeny and export it
    to GraphML and other formats.
    """
    phylo_graph = BioBayesGraph()
    phylo_file = os.path.dirname(os.path.realpath(__file__)) + '/example_data/Asp_protease_2.xml'
    tmp_file = StringIO()
    try:
        phylo_graph.populate_from_phyloxml(phylo_file)
        phylo_graph.export_as_graphml(tmp_file)
    finally:
        tmp_file.close()
def testOntologyFromGOOBO(self):
    """
    Tries to load a GO OBO ontology from file.
    """
    obo_graph = BioBayesGraph()
    obo_file = os.path.dirname(os.path.realpath(__file__)) + '/example_data/go_daily-termdb.obo-xml.gz'
    obo_from_gzip = gzip.open(obo_file, 'rb')
    # Ontology aspect can be one of:
    # [u'molecular_function', u'cellular_component', u'biological_process']
    graph = obo_graph.populate_from_go_obo_xml(obo_file_buffer=obo_from_gzip,
                                               ontology_aspect='molecular_function')
    obo_from_gzip.close()

    for k in self.node_prop_types.iterkeys():
        self.assertTrue((k in graph.vertex_properties),
                        msg="%s not imported correctly." % k)
class Test_inference(unittest.TestCase):
    """
    Test class for creating graphical model scaffolds from phylogeny files
    """
    def setUp(self):
        """
        Loads a phylogeny.
        """
        phylo_file = os.path.dirname(os.path.realpath(__file__)) + "/example_data/Asp_protease_2.xml"
        self.phylo_graph = BioBayesGraph()
        self.graph = self.phylo_graph.populate_from_phyloxml(phylo_file)

        # Incorporates the code for the ProbDist1 class into the graph
        class ProbDist1(object):
            def __init__(self, graph, node, node_to_name_map):
                # graph and node are, respectively:
                # http://projects.skewed.de/graph-tool/doc/graph_tool.html#graph_tool.Graph
                # http://projects.skewed.de/graph-tool/doc/graph_tool.html#graph_tool.Vertex
                # node_to_name_map is a python dictionary mapping any named
                # node's index (obtained via int(node_of_interest)) to the
                # associated phylogenetic name, if one exists.
                self.graph = graph
                self.node = node
                self.name_to_node_map = node_to_name_map

            def compute_virtual_likelihood(self, vals, auxiliary_info):
                # "vals" is the vector of the particular values this node
                # is taking.
                #
                # "auxiliary_info" is the custom information provided
                # when the virtual evidence was specified.
                return 1

            def compute_pd(self, vals):
                # Returns the conditional probability for this node at vals.
                # Get parent node(s):
                parents = []
                for p_node in self.node.in_neighbours():
                    parents.append(int(p_node))
                # Note that you shape this depending on node location and
                # other properties in the graph.
                # Also, you can store computations into class-wide variables
                # (e.g. ClassName.var_to_store) to cache computations. You
                # could also declare the variable being stored to as global.
                return 1

        self.phylo_graph.add_prob_dist(prob_dist_class=ProbDist1)

        # Sets all nodes to have two variables:
        # the first with 3 values, the second with 2 values.
        for node in self.graph.vertices():
            node_index = int(node)
            # Each node has v1, v2
            self.phylo_graph.set_node_variable_count(node_index=node_index,
                                                     num_vars=2)
            # v1 \in {0,1,2}, v2 \in {0,1}
            self.phylo_graph.set_node_variable_domains(node_index=node_index,
                                                       var_domains=[(0, 1, 2), (0, 1)])
            # Use the same probability dist (defined in the class above)
            self.phylo_graph.set_node_probability_dist(node_index=node_index,
                                                       prob_dist_class="ProbDist1")

    def testInference(self):
        """
        Runs a query using libdai.
        """
        # Creates one "hard" observation and one "virtual" observation
        self.phylo_graph.clear_all_evidence()
        self.phylo_graph.add_hard_evidence(
            node_index=self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"),
            observed_value=(0, 1)  # v1 = 0, v2 = 1
        )
        self.phylo_graph.add_virtual_evidence(
            node_index=self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"),
            observed_value=(2, 0),  # v1 = 2, v2 = 0
            auxiliary_info={"custom_info"},  # info provided to likelihood function
        )
        # phylo_graph.remove_evidence_at_node(node_index=phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"))

        self.phylo_graph.create_inference_representation()

        query_nodes = [
            self.phylo_graph.get_node_by_name("C7PIL1_CHIPD/40-136"),   # Some other node
            self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"),  # Set as virtual observation above
            self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"),   # Set as hard observation above
        ]
        q_results = self.phylo_graph.inference_query(query_nodes=query_nodes)

        expected = {
            self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"): ((0, 0), 0.166666666667),
            self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"): ((0, 1), 1.0),
            self.phylo_graph.get_node_by_name("C7PIL1_CHIPD/40-136"): ((0, 1), 0.166666666667),
        }
        for qn, marginals in q_results.iteritems():
            print "For node", self.phylo_graph.get_name_by_node(qn)
            for var_val, marg_val in marginals:
                print var_val, ":", marg_val
                if var_val == expected[qn][0]:
                    self.assertAlmostEqual(marg_val, expected[qn][1])

    def testLeaveOneOut(self):
        """
        Tests leave-one-out inference looping.
        """
        # Creates one "hard" observation and one "virtual" observation
        self.phylo_graph.clear_all_evidence()
        self.phylo_graph.add_hard_evidence(
            node_index=self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"),
            observed_value=(0, 1)  # v1 = 0, v2 = 1
        )
        self.phylo_graph.add_virtual_evidence(
            node_index=self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"),
            observed_value=(2, 0),  # v1 = 2, v2 = 0
            auxiliary_info={"custom_info"},  # info provided to likelihood function
        )

        q_results = self.phylo_graph.inference_query_leave_one_out()
        for qn, left_out_results in q_results.iteritems():
            print "For node", self.phylo_graph.get_name_by_node(qn)
            pprint(left_out_results)
            print "------\n"

        query_nodes = [
            self.phylo_graph.get_node_by_name("C7PIL1_CHIPD/40-136"),   # Some other node
            self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"),  # Set as virtual observation above
            self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"),   # Set as hard observation above
        ]
        q_results = self.phylo_graph.inference_query(query_nodes=query_nodes)

        expected = {
            self.phylo_graph.get_node_by_name("C7X6P2_9PORP/206-299"): ((0, 0), 0.166666666667),
            self.phylo_graph.get_node_by_name("C8SHB6_9RHIZ/82-171"): ((0, 1), 1.0),
            self.phylo_graph.get_node_by_name("C7PIL1_CHIPD/40-136"): ((0, 1), 0.166666666667),
        }
        for qn, marginals in q_results.iteritems():
            print "For node", self.phylo_graph.get_name_by_node(qn)
            for var_val, marg_val in marginals:
                print var_val, ":", marg_val
                if var_val == expected[qn][0]:
                    self.assertAlmostEqual(marg_val, expected[qn][1])
            print "------\n"
class SIFTER(object):
    def __init__(self):
        """
        Constructor
        """
        self.phylo_graph = BioBayesGraph()
        self.evidence_processors = {}

    def load_phylogeny(self, phylo_file, phylo_format='phyloxml'):
        """
        Loads a phylogeny into the internal graph.
        """
        if phylo_format == 'phyloxml':
            self.phylo_graph.populate_from_phyloxml(phylo_file)
        elif phylo_format == 'newick':
            self.phylo_graph.populate_from_newick(phylo_file)
        else:
            raise Exception("Phylo format requested isn't supported.")

    def load_evidence_processor(self, evidence_type, evidence_processor_class,
                                processor_settings):
        '''
        Loads an evidence processor into the internal registry.
        '''
        self.evidence_processors[evidence_type] = \
            evidence_processor_class(processor_settings)

    def parse_evidence(self, evidence_type, evidence_file,
                       evidence_constraints, evidence_format):
        """
        Parses an evidence file using the processor registered for
        the given evidence type.
        """
        if evidence_type not in self.evidence_processors:
            raise Exception("Evidence type requested doesn't have a handler.")
        return self.evidence_processors[evidence_type].parse_evidence(
            evidence_file=evidence_file,
            evidence_format=evidence_format,
            evidence_constraints=evidence_constraints)

    def setup_nodes(self, node_to_fcn_model_map):
        '''
        node_to_fcn_model_map is a function mapping "vertex_id" to
            {
            'auxiliary_info':{'num_functions':num_fcns, 'max_num_simul':3},
            'prob_dist_class':prob_dist_class
            }
        E.g.
            node_to_fcn_model_map = \
                lambda v: {
                    'auxiliary_info':{'num_functions':num_fcns, 'max_num_simul':3},
                    'prob_dist_class':'FunctionModels.Sifter2.FunctionModel'
                }
        '''
        dist_fcn_classes = {}
        for n in self.phylo_graph.g.vertices():
            node_index = int(n)
            fcn_model_info = node_to_fcn_model_map(node_index)

            # Store auxiliary info by node
            self.phylo_graph.set_node_auxiliary_information(
                node_index=node_index,
                auxiliary_info=fcn_model_info['auxiliary_info'])

            # Make an instance of the custom prob dist function
            dist_model = fcn_model_info['prob_dist_class']
            if dist_model.__name__ not in dist_fcn_classes:
                self.phylo_graph.add_prob_dist(prob_dist_class=dist_model)
                dist_fcn_classes[dist_model.__name__] = dist_model(None, None, None, None)
            dist_inst = dist_fcn_classes[dist_model.__name__]

            # Query the number of variables from the custom function
            self.phylo_graph.set_node_variable_count(node_index=node_index,
                                                     num_vars=1)

            # Query the domain of each variable from the custom function
            protein_states = [f for f in dist_inst.possible_protein_states(
                fcn_variants_cnt=fcn_model_info['auxiliary_info']['num_functions'],
                max_fcn_cnt=fcn_model_info['auxiliary_info']['max_num_simul'])]
            self.phylo_graph.set_node_variable_domains(node_index=node_index,
                                                       var_domains=[protein_states])

            # Store the distribution function in the graph for the node.
            self.phylo_graph.set_node_probability_dist(
                node_index=node_index,
                prob_dist_class=fcn_model_info['prob_dist_class'].__name__)

    def process_evidence(self, evidence_type, evidence_set, evidence_constraints):
        '''
        Incorporates evidence into the graph using the appropriate processor.
        '''
        if evidence_type not in self.evidence_processors:
            raise Exception("Evidence type requested doesn't have a handler.")
        return self.evidence_processors[evidence_type].process_evidence(
            evidence_set=evidence_set,
            evidence_constraints=evidence_constraints)
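# A minimal, hypothetical driver for the SIFTER class above. The call
# sequence and keyword signatures come from the methods in this class;
# the file names, the 'go_annotations' evidence-type key, the
# {'IDA': 0.9} evidence-code weight, and the FunctionModel import are
# illustrative assumptions, not part of this module.
def example_sifter_run():
    from FunctionModels.Sifter2 import FunctionModel  # assumed importable

    sifter = SIFTER()
    sifter.load_phylogeny('example_data/Asp_protease_2.xml',
                          phylo_format='phyloxml')
    sifter.load_evidence_processor(
        evidence_type='go_annotations',
        evidence_processor_class=EvidenceProcessor,
        processor_settings={'go_file': 'example_data/go_daily-termdb.obo-xml.gz',
                            'go_format': 'oboxml'})
    # Maps a method-of-curation code to a belief weight in [0, 1].
    constraints = {'IDA': 0.9}
    ev_set = sifter.parse_evidence(evidence_type='go_annotations',
                                   evidence_file='example_data/family.pli',
                                   evidence_constraints=constraints,
                                   evidence_format='pli')
    sifter.setup_nodes(lambda v: {
        'auxiliary_info': {'num_functions': 4, 'max_num_simul': 3},
        'prob_dist_class': FunctionModel})
    return sifter.process_evidence(evidence_type='go_annotations',
                                   evidence_set=ev_set,
                                   evidence_constraints=constraints)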
class Test_nodedistributionsetup(unittest.TestCase):
    """
    Test class for creating graphical model scaffolds from phylogeny files
    """
    def setUp(self):
        """
        Loads a phylogeny.
        """
        phylo_file = os.path.dirname(os.path.realpath(__file__)) + "/example_data/Asp_protease_2.xml"
        self.phylo_graph = BioBayesGraph()
        self.graph = self.phylo_graph.populate_from_phyloxml(phylo_file)

    def testSetLeafNodeVariableInitialization(self):
        """
        Tries to set up graphical model leaf node variables properly.
        """
        # Sets all leaf nodes to have 3 variables each,
        # and defines the domain of each explicitly.
        card_sum1 = 0.0
        for node_index in self.phylo_graph.iterleafnodes():
            # print graph.vertex_properties["name"][graph.vertex(node_index)]
            self.phylo_graph.set_node_variable_count(node_index=node_index,
                                                     num_vars=3)
            # So var1 can take values in {0,1},
            # var2 can take values in {0,1,2,3},
            # and var3 can take values in {'a','b','c'}
            self.phylo_graph.set_node_variable_domains(
                node_index=node_index,
                var_domains=[(0, 1), (0, 1, 2, 3), ("a", "b", "c")]
            )
            for v_ind in range(self.phylo_graph.get_node_variable_count(node_index)):
                card_sum1 += len(self.phylo_graph.get_node_variable_domain(node_index, v_ind))

        self.assertEqual(card_sum1, 2745.0,  # 305*(2+4+3)
                         "Leaf node variable cardinality sum check didn't pass")

    def testSetInternalNodeVariableInitialization(self):
        """
        Tries to set up graphical model internal node variables properly.
        """
        card_sum2 = 0.0
        for node_index in self.phylo_graph.iterinternalnodes():
            self.phylo_graph.set_node_variable_count(node_index=node_index,
                                                     num_vars=3)
            # Each of var1, var2, var3 can take values in {0,1}
            self.phylo_graph.set_node_variable_domains(node_index=node_index,
                                                       var_domains=[(0, 1), (0, 1), (0, 1)])
            for v_ind in range(self.phylo_graph.get_node_variable_count(node_index)):
                card_sum2 += len(self.phylo_graph.get_node_variable_domain(node_index, v_ind))

        self.assertEqual(card_sum2, 1818.0,  # 303*(2+2+2)
                         "Internal node variable cardinality sum check didn't pass")

    def testProbabilityDistInitialization(self):
        """
        Tries to set probability distributions for leaf nodes.
        """
        self.testSetLeafNodeVariableInitialization()
        card_sum1 = 0.0
        for node_index in self.phylo_graph.iterleafnodes():
            self.phylo_graph.set_node_variable_count(node_index=node_index,
                                                     num_vars=3)
            for v_ind in range(self.phylo_graph.get_node_variable_count(node_index)):
                card_sum1 += len(self.phylo_graph.get_node_variable_domain(node_index, v_ind))

        self.assertEqual(card_sum1, 2745.0,  # 305*(2+4+3)
                         "Leaf node variable cardinality sum check didn't pass")
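# A quick arithmetic cross-check (a sketch, not one of the tests) of the
# fixture counts implied by the assertions above: 305 leaves and 303
# internal nodes, per the cardinality sums, appear to account for the
# 608 vertices asserted in testPhylogenyFromNewick.
def _fixture_count_check():
    assert 305 * (2 + 4 + 3) == 2745  # leaf cardinality sum
    assert 303 * (2 + 2 + 2) == 1818  # internal cardinality sum
    assert 305 + 303 == 608           # total matches num_vertices()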
class EvidenceProcessor(object):
    '''
    This is the SIFTER 2.0 evidence handling method.
    '''
    def __init__(self, processor_settings):
        '''
        For SIFTER 2.0, the molecular function gene ontology
        is loaded into a graph.
        '''
        self.evidence_ontology = BioBayesGraph()
        self._load_go_ontology(go_file=processor_settings['go_file'],
                               go_format=processor_settings['go_format'])

    def parse_evidence(self, evidence_file, evidence_format, evidence_constraints):
        '''
        Routing function to parse evidence from different format sources.
        Doesn't process the evidence; only parses the file.
        '''
        if evidence_format == 'pli':
            go_ev_set = pli_parser.parser(
                evidence_file=evidence_file,
                evidence_constraints=evidence_constraints)
            return go_ev_set
        else:
            raise Exception("Evidence format requested isn't supported.")

    def process_evidence(self, evidence_set, evidence_constraints):
        '''
        Using the parsed evidence, this places the evidence set and
        modifies the gene ontology graph in the SIFTER 2.0 way.
        '''
        # For each protein in the evidence set, store the annotation
        # into the evidence graph
        go_terms = set([])
        for pid_json, annot_json in evidence_set.iteritems():
            p_ev_set = json.loads(annot_json['evidence_set'])
            for go_term, moc in p_ev_set:
                go_terms.add(go_term)

        annotated_term_nodes = {}
        for go_term in go_terms:
            g_node = self.evidence_ontology.get_node_by_name(go_term)
            if g_node is None:
                raise Exception("GO term %s doesn't seem to be named in your ontology." % go_term)
            annotated_term_nodes[go_term] = g_node

        go_subdag = self._get_ontology_subdag(annotated_term_nodes=annotated_term_nodes)
        #self._visualize_ontology_subdag(go_subdag, "./sub_dag.pdf")

        processed_ev_set = {}
        # Now for each protein, add the graphical model evidence
        for pid_json, annot_json in evidence_set.iteritems():
            p_ev_set = json.loads(annot_json['evidence_set'])
            processed_ev_set[pid_json] = self._distribute_evidence_to_subdag_leaves(
                sub_dag=go_subdag,
                evidence_constraints=evidence_constraints,
                protein_evidence_set=p_ev_set)
        return processed_ev_set

    def _get_ontology_subdag(self, annotated_term_nodes):
        """
        Given the annotated term nodes, returns a filtered subgraph of
        self.evidence_ontology that only contains those nodes or their
        ancestors.
        """
        # For each annotated node, traverse to the root node of the ontology
        # to include all its less-specific terms
        all_term_nodes = set([])
        for go_term, annot_term in annotated_term_nodes.iteritems():
            for generic_term in self._trace_to_ontology_root(
                    self.evidence_ontology.g.vertex(annot_term)):
                all_term_nodes.add(generic_term)

        sub_dag = graph_tool.GraphView(self.evidence_ontology.g,
                                       vfilt=lambda v: v in all_term_nodes)
        return sub_dag

    def _trace_to_ontology_root(self, cur_node):
        """
        Generator to recursively visit all nodes on each path
        from a node up to the root node.
        """
        yield cur_node
        # Edges point from child to parent, so out-edges lead rootward.
        for edge_in in cur_node.out_edges():
            if self.evidence_ontology.g.edge_properties['edge_type'][edge_in] == 'is_a':
                for n in self._trace_to_ontology_root(edge_in.target()):
                    yield n

    def _get_top_node(self, sub_dag):
        """
        Gives the root node of the sub dag.
        """
        for c in sub_dag.vertices():
            if c.out_degree() == 0:
                return c
        return None

    def _get_leaves_from_node(self, sub_dag, top_node):
        descendant_leaves = set()
        # A node's in-neighbours are its children; a leaf has in-degree 0.
        for c in top_node.in_neighbours():
            if not (c.in_degree() == 0):
                descendant_leaves = descendant_leaves.union(
                    self._get_leaves_from_node(sub_dag, c))
            else:
                descendant_leaves.add(c)
        return descendant_leaves

    def _visualize_ontology_subdag(self, sub_dag, output_file):
        """
        Draws sub-dag to file.
        """
        # http://projects.skewed.de/graph-tool/doc/search_module.html?highlight=leaf
        pos = graph_tool.draw.graphviz_draw(sub_dag,
                                            size=(30, 30),
                                            ratio="fill",
                                            layout="dot",
                                            vprops={'label': sub_dag.vertex_properties['go_id'],
                                                    #'xlabel': sub_dag.vertex_properties['go_name'],
                                                    },
                                            output="/dev/null/tmp.pdf")
        return graph_tool.draw.graph_draw(sub_dag, pos=pos,
                                          vertex_text=sub_dag.vertex_properties['go_id'],
                                          vertex_font_size=8,
                                          nodesfirst=True,
                                          #vertex_shape="double_circle",
                                          vertex_fill_color="#729fcf",
                                          vertex_pen_width=3,
                                          output=output_file)

    def _distribute_evidence_to_subdag_leaves(self, sub_dag,
                                              protein_evidence_set,
                                              evidence_constraints):
        """
        Propagates the evidence in protein_evidence_set over sub_dag
        and returns a dictionary of {go_term: probability} by
        distributing the evidence in the SIFTER 2.0 way.
        """
        def prob_or(p1, p2):
            return 1.0 - (1.0 - p1) * (1.0 - p2)

        def binomial(n, k):
            # n-choose-k via Pascal's triangle
            bc = [1 for i in range(0, k + 1)]
            for j in range(1, n - k + 1):
                for i in range(1, k + 1):
                    bc[i] = bc[i - 1] + bc[i]
            return bc[k]

        def probability_of_observing_k_nodes(r_value, k):
            if k == 0:
                return 1.0 / r_value
            prob = 0
            for i in range(1, k + 1):
                prob = prob + binomial(k, i) * 1 / (r_value ** i)
            return prob

        def calculate_R_value(total_num_leaves):
            r_value = 1.0 / (2 ** (1.0 / total_num_leaves) - 1)
            return r_value

        # Candidate function set = leaves reachable from the root.
        root_node = self._get_top_node(sub_dag)
        candidate_fcns = [sub_dag.vertex_properties['go_id'][k]
                          for k in self._get_leaves_from_node(sub_dag, root_node)]

        # Set initial probabilities in DAG for evidence provided by this protein
        go_term_likelihoods = {sub_dag.vertex_properties['go_id'][k]:
                               {'likelihood': 0, 'dag_vertex_id': int(k)}
                               for k in sub_dag.vertices()}
        for go_term, ev_method in protein_evidence_set:
            go_term_likelihoods[go_term]['likelihood'] = \
                prob_or(go_term_likelihoods[go_term]['likelihood'],
                        evidence_constraints[ev_method])

        # Now for any terms that are ancestral, propagate the probabilities
        # down to their descendant leaves.
        r_value = calculate_R_value(len(candidate_fcns))
        for go_term, ev_method in protein_evidence_set:
            dag_node = sub_dag.vertex(go_term_likelihoods[go_term]['dag_vertex_id'])
            # Skip if this term is already a leaf (leaves have in-degree 0)
            if dag_node.in_degree() == 0:
                continue
            descendant_leaf_set = self._get_leaves_from_node(sub_dag, dag_node)

            # Propagate evidence to leaf nodes
            parent_prob = go_term_likelihoods[go_term]['likelihood']
            transmission_coeff = probability_of_observing_k_nodes(r_value, 0) \
                / probability_of_observing_k_nodes(r_value, len(descendant_leaf_set))
            for leaf_node in descendant_leaf_set:
                leaf_go_id = sub_dag.vertex_properties['go_id'][leaf_node]
                old_likelihood = go_term_likelihoods[leaf_go_id]['likelihood']
                new_likelihood = prob_or(old_likelihood, parent_prob * transmission_coeff)
                # Store update
                go_term_likelihoods[leaf_go_id]['likelihood'] = new_likelihood

        # This step is performed in the Java code, and has the effect of
        # making all likelihoods non-zero, though the underlying reason
        # for doing it is unknown.
        def synchronize_likelihoods(leaf_go_nums, r_value, evidence_nodes):
            # Calculate the probability of observing a subset of the power
            # set having the size of the leaf set. Note, this isn't
            # equivalent to:
            #   leaf_subset_prior = probability_of_observing_k_nodes(len(leaf_go_nums))
            num_leaves = len(leaf_go_nums)
            leaf_subset_prior = 0
            for i in range(1, num_leaves + 1):
                leaf_subset_prior = leaf_subset_prior \
                    + binomial(num_leaves - 1, i) * 1 / (r_value ** i)

            # Calculate likelihood of ANY leaf
            likelihood_of_any_leaf = 0.0
            for leaf_go_num in leaf_go_nums:
                likelihood_of_any_leaf = prob_or(likelihood_of_any_leaf,
                                                 evidence_nodes[leaf_go_num]['likelihood'])

            # Not entirely sure what's going on here:
            # translated from synchronizeLikelihoods() in PFunGODAG.java.
            not_in_a_subset_prior = (1.0 - likelihood_of_any_leaf) * leaf_subset_prior
            for leaf_go_num in leaf_go_nums:
                current_likelihood = evidence_nodes[leaf_go_num]['likelihood']
                new_likelihood = prob_or(current_likelihood, not_in_a_subset_prior)
                evidence_nodes[leaf_go_num]['likelihood'] = new_likelihood

        synchronize_likelihoods(candidate_fcns, r_value, go_term_likelihoods)

        # Again, this step is performed in the Java code and makes all
        # likelihoods non-zero, though the underlying reason is unknown.
        def a_priori_evidence(leaf_go_nums, r_value, evidence_nodes):
            total = 1.0
            count_of_unlikely_leaves = 0
            total_num_leaves = len(leaf_go_nums)
            for leaf_go_num in leaf_go_nums:
                leaf_likelihood = evidence_nodes[leaf_go_num]['likelihood']
                if leaf_likelihood > 0:
                    total = total * leaf_likelihood
                else:
                    count_of_unlikely_leaves = count_of_unlikely_leaves + 1

            if count_of_unlikely_leaves > 0:
                rest = (1.0 / (r_value ** total_num_leaves)) / total
                a = rest ** (1.0 / count_of_unlikely_leaves)
                for leaf_go_num in leaf_go_nums:
                    leaf_likelihood = evidence_nodes[leaf_go_num]['likelihood']
                    # For each zero likelihood, apply the fudge factor.
                    if leaf_likelihood <= 0:
                        leaf_likelihood = a
                    evidence_nodes[leaf_go_num]['likelihood'] = leaf_likelihood

        a_priori_evidence(candidate_fcns, r_value, go_term_likelihoods)

        return {k: go_term_likelihoods[k]['likelihood'] for k in candidate_fcns}

    def _load_go_ontology(self, go_file, go_format='oboxml'):
        """
        Loads the gene ontology into self.evidence_ontology from either
        a gzipped OBO-XML dump or a previously exported GraphML file.
        """
        if go_format == 'oboxml':
            obo_from_gzip = gzip.open(go_file, 'rb')
            # Ontology aspect can be one of:
            # [u'molecular_function', u'cellular_component', u'biological_process']
            self.evidence_ontology.populate_from_go_obo_xml(
                obo_file_buffer=obo_from_gzip,
                ontology_aspect='molecular_function')
            obo_from_gzip.close()
        elif go_format == 'biobayesgraph':
            self.evidence_ontology.import_from_graphml(go_file)
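# A small standalone sanity check (a sketch, not part of the class) of
# the evidence-distribution arithmetic used above: noisy-OR combination
# via prob_or, and the SIFTER 2.0 r-value. With 4 candidate leaves,
# r = 1/(2**(1/4) - 1) ~= 5.285, so the prior of observing zero
# annotated leaves is 1/r ~= 0.189; two independent 0.9 observations
# combine to 1 - 0.1*0.1 = 0.99.
def _check_evidence_math():
    def prob_or(p1, p2):
        return 1.0 - (1.0 - p1) * (1.0 - p2)

    r_value = 1.0 / (2 ** (1.0 / 4) - 1)
    assert abs(r_value - 5.2852) < 1e-3
    assert abs(prob_or(0.9, 0.9) - 0.99) < 1e-9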