def testGraphLoad(self):
    """
    Loads graph

    Imports example_data/MF.bbg (resolved relative to the directory of this
    file) into a fresh BioBayesGraph and asserts that every key of
    self.node_prop_types is present in the graph's vertex properties.
    """
    data_dir = os.path.dirname(os.path.realpath(__file__))
    obo_graph = BioBayesGraph()
    obo_graph.import_from_graphml(os.path.join(data_dir, "example_data", "MF.bbg"))
    # Iterate the dict directly: works on both Python 2 and 3, unlike
    # iterkeys().
    for k in self.node_prop_types:
        self.assertTrue((k in obo_graph.g.vertex_properties),
                        msg="%s not imported correctly." % k)
class EvidenceProcessor(object):
    """
    This is the SIFTER 2.0 evidence handling method.

    The ontology is held in self.evidence_ontology (a BioBayesGraph).  The
    traversal helpers in this class treat 'is_a' edges as pointing from the
    more specific term to the more general one: the root is the vertex with
    no outgoing edges, and leaves are vertices with no incoming edges.
    """

    def __init__(self, processor_settings):
        """
        For SIFTER 2.0, the molecular function gene ontology is loaded
        into a graph.

        processor_settings must provide the keys 'go_file' and 'go_format',
        which are forwarded to _load_go_ontology.
        """
        self.evidence_ontology = BioBayesGraph()
        self._load_go_ontology(go_file=processor_settings['go_file'],
                               go_format=processor_settings['go_format'])

    def parse_evidence(self, evidence_file, evidence_format, evidence_constraints):
        """
        Routing function to parse evidence from different format sources.
        Doesn't process the evidence; only parses the file.

        Only the 'pli' format is handled (delegated to pli_parser.parser);
        any other evidence_format raises Exception.
        """
        if evidence_format == 'pli':
            return pli_parser.parser(evidence_file=evidence_file,
                                     evidence_constraints=evidence_constraints)
        raise Exception("Evidence format requested isn't supported.")

    def process_evidence(self, evidence_set, evidence_constraints):
        """
        Using the parsed evidence, this places the evidence set and modifies
        the gene ontology graph in the SIFTER 2.0 way.

        evidence_set maps a protein id to a record whose 'evidence_set'
        entry is a JSON-encoded list of (go_term, evidence_method) pairs.
        evidence_constraints is indexed by evidence method (see
        _distribute_evidence_to_subdag_leaves).

        Returns a dict mapping each protein id to the {go_term: likelihood}
        dict produced by _distribute_evidence_to_subdag_leaves.

        Raises Exception if an annotated GO term is not named in the
        ontology.
        """
        # Decode every protein's annotation list once, collecting the set of
        # annotated GO terms along the way.
        decoded_sets = {}
        go_terms = set()
        for pid_json, annot_json in evidence_set.items():
            p_ev_set = json.loads(annot_json['evidence_set'])
            decoded_sets[pid_json] = p_ev_set
            for go_term, _moc in p_ev_set:
                go_terms.add(go_term)

        # Resolve each annotated term to its ontology node.
        annotated_term_nodes = {}
        for go_term in go_terms:
            g_node = self.evidence_ontology.get_node_by_name(go_term)
            if g_node is None:
                raise Exception(
                    "GO term, %s doesn't seem to be named in your ontology." % go_term)
            annotated_term_nodes[go_term] = g_node

        go_subdag = self._get_ontology_subdag(annotated_term_nodes=annotated_term_nodes)

        # Now for each protein, add the graphical model evidence
        processed_ev_set = {}
        for pid_json, p_ev_set in decoded_sets.items():
            processed_ev_set[pid_json] = self._distribute_evidence_to_subdag_leaves(
                sub_dag=go_subdag,
                evidence_constraints=evidence_constraints,
                protein_evidence_set=p_ev_set)
        return processed_ev_set

    def _get_ontology_subdag(self, annotated_term_nodes):
        """
        Given annotated_term_nodes ({go_term: ontology node}), returns a
        filtered view of self.evidence_ontology.g that only contains those
        nodes or their ancestors.
        """
        # For each annotated node, traverse to the root node of the ontology
        # to include all its less-specific terms
        all_term_nodes = set()
        ontology_graph = self.evidence_ontology.g
        for annot_term in annotated_term_nodes.values():
            all_term_nodes.update(
                self._trace_to_ontology_root(ontology_graph.vertex(annot_term)))

        return graph_tool.GraphView(ontology_graph,
                                    vfilt=lambda v: v in all_term_nodes)

    def _trace_to_ontology_root(self, cur_node):
        """
        Generator to recursively visit all nodes on each path from a node up
        to the root node.

        Yields cur_node first, then follows every outgoing edge whose
        'edge_type' property equals 'is_a'.  A node reachable along several
        paths is yielded once per path.
        """
        yield cur_node
        edge_types = self.evidence_ontology.g.edge_properties['edge_type']
        for edge_out in cur_node.out_edges():
            if edge_types[edge_out] == 'is_a':
                for ancestor in self._trace_to_ontology_root(edge_out.target()):
                    yield ancestor

    def _get_top_node(self, sub_dag):
        """
        Gives the root node of the sub dag: the first vertex with no
        outgoing edges, or None if there is no such vertex.
        """
        for vertex in sub_dag.vertices():
            if vertex.out_degree() == 0:
                return vertex
        return None

    def _get_leaves_from_node(self, sub_dag, top_node):
        """
        Returns the set of leaf vertices (vertices with no incoming edges)
        reachable from top_node by repeatedly following in-neighbours.
        top_node itself is never included, so a leaf yields an empty set.
        """
        descendant_leaves = set()
        for child in top_node.in_neighbours():
            if child.in_degree() == 0:
                descendant_leaves.add(child)
            else:
                descendant_leaves.update(self._get_leaves_from_node(sub_dag, child))
        return descendant_leaves

    def _visualize_ontology_subdag(self, sub_dag, output_file):
        """
        Draws sub-dag to file.

        graphviz_draw is called first to obtain vertex positions; those
        positions are then handed to graph_draw, which writes output_file.
        Returns whatever graph_draw returns.
        """
        #http://projects.skewed.de/graph-tool/doc/search_module.html?highlight=leaf
        # NOTE(review): "/dev/null/tmp.pdf" is not a writable path
        # (/dev/null is not a directory); presumably the intent is to
        # discard graphviz's own rendering and keep only the layout --
        # confirm, and consider a temporary file instead.
        pos = graph_tool.draw.graphviz_draw(
            sub_dag,
            size=(30, 30),
            ratio="fill",
            layout="dot",
            vprops={'label': sub_dag.vertex_properties['go_id']},
            output="/dev/null/tmp.pdf")
        # NOTE(review): line breaks were lost in the reviewed copy, so a
        # bare `return` followed by unreachable drawing code is also a
        # possible reading of the original -- confirm against history.
        return graph_tool.draw.graph_draw(
            sub_dag,
            pos=pos,
            vertex_text=sub_dag.vertex_properties['go_id'],
            vertex_font_size=8,
            nodesfirst=True,
            vertex_fill_color="#729fcf",
            vertex_pen_width=3,
            output=output_file)

    def _distribute_evidence_to_subdag_leaves(self, sub_dag, protein_evidence_set, evidence_constraints):
        """
        Propagates the evidence in protein_evidence_set over sub_dag and
        returns a dictionary of {go_term: probability} by distributing the
        evidence in the SIFTER 2.0 way.

        sub_dag: graph view whose 'go_id' vertex property names each vertex.
        protein_evidence_set: iterable of (go_term, evidence_method) pairs.
        evidence_constraints: indexable by evidence_method; the value is
            combined as a probability via a noisy-OR.

        The returned dict has one entry per candidate function, i.e. per
        leaf reachable from the root of sub_dag.  Raises KeyError if a
        go_term in protein_evidence_set is not a vertex of sub_dag.
        """
        def prob_or(p1, p2):
            # Probability that at least one of two independent events occurs.
            return 1.0 - (1.0 - p1) * (1 - p2)

        def binomial(n, k):
            # Binomial coefficient via Pascal's rule.  For k > n the outer
            # loop does not run and 1 is returned (not 0);
            # synchronize_likelihoods below reaches that case.
            bc = [1 for i in range(0, k + 1)]
            for j in range(1, n - k + 1):
                for i in range(1, k + 1):
                    bc[i] = bc[i - 1] + bc[i]
            return bc[k]

        def probability_of_observing_k_nodes(r_value, k):
            if (k == 0):
                return 1.0 / r_value
            prob = 0
            for i in range(1, k + 1):
                prob = prob + binomial(k, i) * 1 / (r_value ** (i))
            return prob

        def calculate_R_value(total_num_leaves):
            # Raises ZeroDivisionError when total_num_leaves is 0.
            return 1.0 / (2 ** (1.0 / total_num_leaves) - 1)

        # This step is performed in Java code, and has the effect of making
        # all likelihoods non-zero, though underlying reason for doing this
        # is unknown.
        def synchronize_likelihoods(leaf_go_nums, r_value, evidence_nodes):
            # Calculate probability of observing subset of power set having
            # size of leaf set.  Note, this isn't equivalent to:
            # leaf_subset_prior = probability_of_observing_k_nodes(len(leaf_go_nums))
            num_leaves = len(leaf_go_nums)
            leaf_subset_prior = 0
            for i in range(1, num_leaves + 1):
                leaf_subset_prior = leaf_subset_prior \
                    + binomial(num_leaves - 1, i) * 1 / (r_value ** (i))

            # Calculate likelihood of ANY leaf
            likelihood_of_any_leaf = 0.0
            for leaf_go_num in leaf_go_nums:
                likelihood_of_any_leaf = prob_or(
                    likelihood_of_any_leaf,
                    evidence_nodes[leaf_go_num]['likelihood'])

            # Not entirely sure what's going on here:
            # Translated from "synchronizeLikelihoods() in PFunGODAG.java.
            not_in_a_subset_prior = (1.0 - likelihood_of_any_leaf) * leaf_subset_prior
            for leaf_go_num in leaf_go_nums:
                current_likelihood = evidence_nodes[leaf_go_num]['likelihood']
                evidence_nodes[leaf_go_num]['likelihood'] = prob_or(
                    current_likelihood, not_in_a_subset_prior)

        # Again, this step is performed in Java code and makes all
        # likelihoods non-zero, though underlying reason for doing this is
        # unknown.
        def a_priori_evidence(leaf_go_nums, r_value, evidence_nodes):
            total = 1.0
            count_of_unlikely_leaves = 0
            total_num_leaves = len(leaf_go_nums)
            for leaf_go_num in leaf_go_nums:
                leaf_likelihood = evidence_nodes[leaf_go_num]['likelihood']
                if (leaf_likelihood > 0):
                    total = total * leaf_likelihood
                else:
                    count_of_unlikely_leaves = count_of_unlikely_leaves + 1
            if (count_of_unlikely_leaves > 0):
                rest = (1.0 / (r_value ** total_num_leaves)) / total
                a = rest ** (1.0 / count_of_unlikely_leaves)
                for leaf_go_num in leaf_go_nums:
                    # For each zero likelihood, we want to fudge factor a bit.
                    if (evidence_nodes[leaf_go_num]['likelihood'] <= 0):
                        evidence_nodes[leaf_go_num]['likelihood'] = a

        go_ids = sub_dag.vertex_properties['go_id']

        # Candidate function set = leaves starting from the root.
        root_node = self._get_top_node(sub_dag)
        candidate_fcns = [go_ids[leaf]
                          for leaf in self._get_leaves_from_node(sub_dag, root_node)]

        # Set initial probabilities in DAG for evidence provided by this protein
        go_term_likelihoods = {go_ids[v]: {'likelihood': 0, 'dag_vertex_id': int(v)}
                               for v in sub_dag.vertices()}
        for go_term, ev_method in protein_evidence_set:
            go_term_likelihoods[go_term]['likelihood'] = prob_or(
                go_term_likelihoods[go_term]['likelihood'],
                evidence_constraints[ev_method])

        # Now for any that are ancestral, propagate the probabilities down
        # in a wonky way
        r_value = calculate_R_value(len(candidate_fcns))
        for go_term, ev_method in protein_evidence_set:
            dag_node = sub_dag.vertex(go_term_likelihoods[go_term]['dag_vertex_id'])
            # NOTE(review): the original comment here read "Skip if is
            # leaf", but with the edge direction used by _get_top_node and
            # _get_leaves_from_node a vertex with out_degree() == 0 is the
            # ROOT.  The check is kept unchanged; a true leaf has no
            # descendant leaves, so the loop below is already a no-op for
            # it.  Confirm whether root evidence is meant to be skipped.
            if dag_node.out_degree() == 0:
                continue
            descendant_leaf_set = self._get_leaves_from_node(sub_dag, dag_node)

            # Propagate evidence to leaf nodes
            parent_prob = go_term_likelihoods[go_term]['likelihood']
            transmission_coeff = probability_of_observing_k_nodes(r_value, 0) \
                / probability_of_observing_k_nodes(r_value, len(descendant_leaf_set))
            for leaf_node in descendant_leaf_set:
                # Index by the leaf being visited.  The previous code used a
                # stale name `k` here, which updated one arbitrary leaf
                # repeatedly on Python 2 and is a NameError on Python 3.
                leaf_go_id = go_ids[leaf_node]
                old_likelihood = go_term_likelihoods[leaf_go_id]['likelihood']
                # Store update
                go_term_likelihoods[leaf_go_id]['likelihood'] = prob_or(
                    old_likelihood, parent_prob * transmission_coeff)

        synchronize_likelihoods(candidate_fcns, r_value, go_term_likelihoods)
        a_priori_evidence(candidate_fcns, r_value, go_term_likelihoods)

        return {k: go_term_likelihoods[k]['likelihood'] for k in candidate_fcns}

    def _load_go_ontology(self, go_file, go_format='oboxml'):
        """
        Populates self.evidence_ontology from go_file.

        go_format 'oboxml': go_file is opened with gzip and parsed as GO
            OBO XML, restricted to the 'molecular_function' aspect.
        go_format 'biobayesgraph': go_file is imported via
            import_from_graphml.
        """
        if go_format == 'oboxml':
            obo_from_gzip = gzip.open(go_file, 'rb')
            try:
                # Ontology aspect can be one of:
                # [u'molecular_function', u'cellular_component', u'biological_process']
                self.evidence_ontology.populate_from_go_obo_xml(
                    obo_file_buffer=obo_from_gzip,
                    ontology_aspect='molecular_function')
            finally:
                # Close the handle even if parsing fails.
                obo_from_gzip.close()
        elif go_format == 'biobayesgraph':
            self.evidence_ontology.import_from_graphml(go_file)
        # NOTE(review): any other go_format silently leaves the ontology
        # unpopulated; consider raising here as parse_evidence does.