def get_gene_ontology_access(self):
    """
    Build the UNIPROT <-> GO term mappings for every uniprot id in ``self.InitSet``.

    Each uniprot node is fetched from ``DatabaseGraph.UNIPORT`` and the nodes linked to it
    through ``is_go_annotation`` are scanned. Only GO nodes whose ``Namespace`` belongs to
    ``self.go_namespace_filter`` are retained.

    Attributes written:
        - ``self.UP_Names[uniprot_id]``: ``[ID, displayName]`` of the uniprot node
        - ``self.GO2UP[go_id]``: the uniprot id is appended (the entry must already be
          appendable; presumably ``GO2UP`` is a ``defaultdict(list)`` - confirm)
        - ``self.SeedSet``: receives every retained GO id
        - ``self.UP2GO_Dict[uniprot_id]``: list of retained GO ids, when there is at least one
        - ``self.UPs_without_GO``: receives the uniprot id when no GO term was retained
    """
    unannotated_count = 0
    log.info('Starting GO matrix mapping starting from %s uniprots', len(self.InitSet))

    for up_id in self.InitSet:
        # NOTE: the lookup result is used unchecked; if the node cannot be retrieved the
        # attribute accesses on the next line are where the failure will surface.
        up_node = DatabaseGraph.UNIPORT.get(up_id)
        self.UP_Names[up_id] = [up_node.ID, up_node.displayName]

        retained_go_ids = []
        # A falsy result (e.g. None) means there is nothing to iterate over.
        for go_node in up_node.bothV("is_go_annotation") or ():
            if go_node.Namespace not in self.go_namespace_filter:
                continue
            go_id = get_bulbs_id(go_node)
            retained_go_ids.append(go_id)
            self.GO2UP[go_id].append(up_id)
            self.SeedSet.add(go_id)

        if retained_go_ids:
            self.UP2GO_Dict[up_id] = list(retained_go_ids)
            continue

        unannotated_count += 1
        log.debug("UP without GO was found. UP bulbs_id: %s, \t name: %s",
                  up_id, self.UP_Names[up_id])
        self.UPs_without_GO.add(up_id)

    log.info('total number of UPs without a go_node annotation: %s out of %s',
             unannotated_count, len(self.InitSet))
def compute_uniprot_attachments(self):
    """
    Fill ``self.Uniprot_attachments`` with the nodes linked to each reached uniprot
    through an ``is_same`` relation (the Reactome proteins, per the log messages).

    For every id in ``self.reached_uniprots_bulbs_id_list`` the uniprot node is fetched and
    its ``is_same`` neighbours are collected as bulbs ids. When the neighbour lookup returns
    ``None`` no entry is written for that uniprot. Summary counts are logged at the end.
    """
    log.info('attaching reactome proteins to uniprot nodes')
    uniprots_with_entry = 0
    reactome_nodes_attached = 0

    for up_id in self.reached_uniprots_bulbs_id_list:
        same_as_nodes = DatabaseGraph.UNIPORT.get(up_id).bothV("is_same")

        if same_as_nodes is None:
            log.debug('No attachment for the node %s', up_id)
            continue

        attached_ids = []
        self.Uniprot_attachments[up_id] = attached_ids
        attached_ids.extend(get_bulbs_id(alias_node) for alias_node in same_as_nodes)

        uniprots_with_entry += 1
        reactome_nodes_attached += len(attached_ids)
        log.debug('attached %s Reactome proteins to the node %s',
                  len(attached_ids), up_id)

    log.info('Attached %s reactome protein nodes to %s / %s uniprot nodes',
             reactome_nodes_attached,
             uniprots_with_entry,
             len(self.reached_uniprots_bulbs_id_list))
def insert_meta_objects(bulbs_graph_class, meta_id_2_property_dict):
    """
    Inserts Meta-Objects (i.e. any physical entity or collection thereof) as members of a
    bulbs class and attaches their annotations, localization and modification features.

    For every entry a node is created through ``bulbs_graph_class.create`` and stored in the
    module-level ``memoization_dict`` under its ID. Nodes whose ID is in ``Leg_ID_Filter``
    have their bulbs id appended to ``ForbiddenIDs``.

    :param bulbs_graph_class: bulbs proxy used to create the nodes; its ``create``,
        ``client.config`` and ``element_class`` attributes are used
    :param meta_id_2_property_dict: maps a meta-object ID to its property dict. Keys read:
        ``'displayName'`` and ``'references'`` (required), ``'cellularLocation'`` and
        ``'modification'`` (optional)
    :raise KeyError: if a required key is missing, or if the referenced cellular location
        has not been stored in ``memoization_dict`` beforehand
    """
    total_properties = len(meta_id_2_property_dict)
    config = bulbs_graph_class.client.config
    element_type = bulbs_graph_class.element_class.get_element_type(config)

    # ``iteritems`` is the Python 2 dict API already used by this file.
    for i, (meta_name, property_dict) in enumerate(meta_id_2_property_dict.iteritems()):

        # Periodic progress report.
        if i * 20 % total_properties < 21:
            log.info('insert %s: %s out of %s', element_type, i, total_properties)

        # 'cellularLocation' is treated as optional further down, so it must not be
        # subscripted unconditionally here: a missing key yields ``None`` instead of
        # raising KeyError before the guard is even reached.
        location_id = property_dict.get('cellularLocation')

        primary = bulbs_graph_class.create(
            ID=meta_name,
            displayName=property_dict['displayName'],
            localization=location_id,
            main_connex=False)

        if meta_name in Leg_ID_Filter:
            ForbiddenIDs.append(get_bulbs_id(primary))

        memoization_dict[meta_name] = primary

        insert_minimal_annotations(primary, property_dict['references'])

        if 'cellularLocation' in property_dict:
            secondary = memoization_dict[location_id]
            DatabaseGraph.is_localized.create(
                primary, secondary,
                costum_from=primary.ID,
                costum_to=secondary.ID)

        for modification in property_dict.get('modification', ()):
            # Only modifications carrying both a location and a description are stored.
            if 'location' in modification and 'modification' in modification:
                located_modification = DatabaseGraph.ModificationFeature.create(
                    ID=modification['ID'],
                    type="post-translational_Mod",
                    location=modification['location'],
                    displayName=modification['modification'])

                DatabaseGraph.is_able_to_modify.create(
                    primary, located_modification,
                    costum_from=primary.ID,
                    costum_to=located_modification.ID)
def pull_up_acc_nums_from_reactome():
    """
    Attempts to retrieve accession numbers from the neo4j database

    Annotation nodes indexed with ``ptype='UniProt'`` are looked up; the string form of
    each node's ``payload`` is used as the accession number. For every accession number
    the vertices adjacent to its annotation node (``bothV()``) are collected.

    :return: dict that maps acnums to the list of vertices adjacent to the corresponding
        annotation node (Reactome proteins)
    :raise Exception: if the lookup of existing uniprot acnum annotation nodes returns
        None, or if fewer than 10 accession numbers were found (Reactome wasn't imported
        yet)
    """
    acc_num_annot_nodes = DatabaseGraph.AnnotNode.index.lookup(ptype='UniProt')

    if acc_num_annot_nodes is None:
        raise Exception(
            "Reactome was not loaded or contains no acc_num cross-references to Uniprot"
        )

    acc_num_dict = {}  # acnum to AnnotNode
    for annotation_node in acc_num_annot_nodes:
        if annotation_node is not None:
            annot_obj = DatabaseGraph.vertices.get(get_bulbs_id(annotation_node))
            acc_num_dict[str(annot_obj.payload)] = annot_obj

    if len(acc_num_dict) < 10:
        raise Exception(
            "Reactome was not loaded or contains no acc_num cross-references to Uniprot"
        )

    reactome_proteins = {}
    for acc_num, annot_obj in acc_num_dict.items():
        linked_vertices = []
        reactome_proteins[acc_num] = linked_vertices

        reactome_proteins_generator = annot_obj.bothV()
        if reactome_proteins_generator is not None:
            linked_vertices.extend(
                vertex for vertex in reactome_proteins_generator if vertex is not None)

        # Report only accession numbers that do not map to exactly one protein. The count
        # (not the list itself) must be compared, and the per-accession count logged.
        if len(linked_vertices) != 1:
            log.debug(
                'Cross-linking reactome v.s. acc_num %s mapped to %s proteins',
                acc_num, len(linked_vertices))

    log.info('Cross-linked %s proteins from reactome v.s. Uniprot', len(reactome_proteins))

    return reactome_proteins
def get_gene_ontology_structure(self):
    """
    Loads all of the relations between the GOs that are generalisation of the seedList GOs
    and that are within the types specified in go_namespace_filter

    Starting from ``self.SeedSet`` the GO graph is walked along the outgoing relations
    listed in ``self._GOUpTypes`` and ``self._GORegTypes``. GO nodes whose ``Namespace`` is
    not in ``self.go_namespace_filter`` are ignored.

    Attributes written:
        - ``self.GO_Names``, ``self.GO_Legacy_IDs``, ``self.rev_GO_IDs``: display name,
          legacy ID and reverse legacy ID lookup for every visited node
        - ``self.Reachable_nodes_dict[node_id]``: 4-tuple of de-duplicated id lists, in
          order: outgoing "up"-type neighbours, outgoing regulation-type neighbours,
          incoming "up"-type neighbours, incoming regulation-type neighbours
        - ``self.All_GOs``, ``self.Num2GO``, ``self.GO2Num``: visited ids and their
          index <-> id mappings
    """
    visited_set = set()
    pending = list(self.SeedSet)  # already a fresh list; used as a LIFO work stack
    log.info('Starting gene ontology structure retrieval from the set of %s seeds',
             len(self.SeedSet))

    while pending:
        node_id = pending.pop()
        # A node reachable along several paths is pushed once per path before it gets
        # visited; skip the repeats instead of re-querying the database for them.
        if node_id in visited_set:
            continue
        visited_set.add(node_id)

        out_up_ids = []
        out_regulation_ids = []
        in_up_ids = []
        in_regulation_ids = []

        gene_ontology_node = DatabaseGraph.GOTerm.get(node_id)
        legacy_id = str(gene_ontology_node.ID)
        self.GO_Names[node_id] = str(gene_ontology_node.displayName)
        self.GO_Legacy_IDs[node_id] = legacy_id
        self.rev_GO_IDs[legacy_id] = node_id

        for relation_type in chain(self._GOUpTypes, self._GORegTypes):
            is_up_type = relation_type in self._GOUpTypes

            related_go_nodes = gene_ontology_node.outV(relation_type)
            if not related_go_nodes:
                # skip in case GO Node has no outgoing relations to other GO nodes
                # NOTE(review): this also skips the incoming-relation scan below for
                # this relation type - confirm that this is intended.
                continue

            for go_node in related_go_nodes:
                if go_node.Namespace not in self.go_namespace_filter:
                    continue
                node_bulbs_id = get_bulbs_id(go_node)
                if node_bulbs_id not in visited_set:
                    pending.append(node_bulbs_id)
                if is_up_type:
                    out_up_ids.append(node_bulbs_id)
                else:
                    out_regulation_ids.append(node_bulbs_id)

            rev_generator = gene_ontology_node.inV(relation_type)
            if not rev_generator:
                continue

            # Incoming neighbours are recorded but never scheduled for a visit.
            for go_node in rev_generator:
                if go_node.Namespace not in self.go_namespace_filter:
                    continue
                node_bulbs_id = get_bulbs_id(go_node)
                if is_up_type:
                    in_up_ids.append(node_bulbs_id)
                else:
                    in_regulation_ids.append(node_bulbs_id)

        self.Reachable_nodes_dict[node_id] = (
            list(set(out_up_ids)),
            list(set(out_regulation_ids)),
            list(set(in_up_ids)),
            list(set(in_regulation_ids)))

    self.All_GOs = list(visited_set)
    self.Num2GO = dict(enumerate(self.All_GOs))
    self.GO2Num = dict((val, i) for i, val in enumerate(self.All_GOs))
def map_rows_to_names(self):
    """
    Maps Node Database IDs, Legacy IDs, display names and types to matrix row/column
    indexes.

    Every node id in ``self.Highest_Set`` gets a consecutive matrix index (in iteration
    order) and its display name, element type, legacy ID and - when the node has one -
    localization display name are recorded. Nodes of element type ``"UNIPROT"`` are also
    collected as reached uniprots. Afterwards all UNIPROT nodes returned by
    ``stable_get_all`` that were not reached are registered as well, without a matrix
    index.

    Attributes written: ``bulbs_id_2_matrix_index`` and ``matrix_index_2_bulbs_id`` (both
    reset), ``bulbs_id_2_display_name``, ``bulbs_id2_node_type``, ``bulbs_id_2_legacy_id``,
    ``bulbs_id_2_localization``, ``reached_uniprots_bulbs_id_list``,
    ``uniprot_matrix_index_list`` and ``all_uniprots_bulbs_id_list``.
    """

    def request_location(_location_buffer_dict, location):
        """
        Just a Buffered lookup of location, since the number of cellular location is
        relatively small (~80), it makes sense to buffer the IOs on it.
        Normally should be moved out as a buffering decorator

        :param _location_buffer_dict: Buffered location
        :param location: location Node Legacy ID we are willing to verify
        :return: displayName of the requested location, or None when the index lookup
            yields nothing (a miss is not buffered)
        """
        location = str(location)
        if location in _location_buffer_dict:
            return _location_buffer_dict[location]

        generator = DatabaseGraph.Location.index.lookup(ID=location)
        if generator is not None:
            # Only the first hit is used.
            for elt in generator:
                display_name = str(elt.displayName)
                _location_buffer_dict[location] = display_name
                return display_name
        return None

    #######################################################################

    location_buffer_dict = {}

    self.bulbs_id_2_matrix_index = {}
    self.matrix_index_2_bulbs_id = {}

    log.info('nodes in Highest Set: %s', len(self.Highest_Set))

    for counter, bulbs_node_id in enumerate(self.Highest_Set):
        self.bulbs_id_2_matrix_index[bulbs_node_id] = counter
        self.matrix_index_2_bulbs_id[counter] = bulbs_node_id

        node = DatabaseGraph.vertices.get(bulbs_node_id)
        self.bulbs_id_2_display_name[bulbs_node_id] = node.displayName
        self.bulbs_id2_node_type[bulbs_node_id] = node.element_type
        self.bulbs_id_2_legacy_id[bulbs_node_id] = node.ID

        if node.element_type == "UNIPROT":
            # TODO: there is a problem: there is no UNIPROT nodes reached during the
            # expansion.
            self.reached_uniprots_bulbs_id_list.append(bulbs_node_id)
            self.uniprot_matrix_index_list.append(counter)

        if node.localization is not None:
            self.bulbs_id_2_localization[bulbs_node_id] = request_location(
                location_buffer_dict, node.localization)

    self.all_uniprots_bulbs_id_list += self.reached_uniprots_bulbs_id_list

    # The set is kept for the membership tests below: testing against the list made the
    # loop over all UNIPROT nodes quadratic.
    reached_uniprots = set(self.reached_uniprots_bulbs_id_list)
    self.reached_uniprots_bulbs_id_list = list(reached_uniprots)
    log.info("reached uniprots: %s", len(self.reached_uniprots_bulbs_id_list))

    up_generator = stable_get_all(DatabaseGraph.UNIPORT)
    if up_generator:
        for up_node in up_generator:
            bulbs_node_id = get_bulbs_id(up_node)
            if bulbs_node_id in reached_uniprots:
                continue
            self.all_uniprots_bulbs_id_list.append(bulbs_node_id)
            self.bulbs_id_2_display_name[bulbs_node_id] = up_node.displayName
            self.bulbs_id2_node_type[bulbs_node_id] = up_node.element_type
            self.bulbs_id_2_legacy_id[bulbs_node_id] = up_node.ID

    self.all_uniprots_bulbs_id_list = list(set(self.all_uniprots_bulbs_id_list))
    log.info("analyzable uniprots: %s", len(self.all_uniprots_bulbs_id_list))