def orthologs_iterator(fo, version):
    """Yield ortholog node and edge documents read from a gzipped JSON-lines file.

    Every line of the decompressed stream is parsed as JSON. A record with a
    'metadata' key sets the ``source`` stamped on the documents yielded after
    it. A record with an 'ortholog' key yields the subject node, the object
    node and then the edge linking them.

    Args:
        fo: gzip-compressed file object; rewound to its start before reading
        version: value copied into the 'version' field of every document

    Yields:
        (collection name, document) tuples
    """

    allowed_species = config['bel_resources'].get('species_list', [])

    fo.seek(0)
    with gzip.open(fo, 'rt') as stream:
        for raw_line in stream:
            record = json.loads(raw_line)

            if 'metadata' in record:
                source = record['metadata']['source']
                continue

            if 'ortholog' not in record:
                continue

            pair = record['ortholog']
            subject = pair['subject']
            subj_tax_id = subject['tax_id']
            target = pair['object']
            obj_tax_id = target['tax_id']

            # Skip the pair when either known tax_id is outside the species filter
            if allowed_species and any(
                tax_id and tax_id not in allowed_species
                for tax_id in (subj_tax_id, obj_tax_id)
            ):
                continue

            # IDs are converted to ArangoDB legal chars for use as _key
            subj_id = subject['id']
            subj_key = arangodb.arango_id_to_key(subj_id)
            obj_id = target['id']
            obj_key = arangodb.arango_id_to_key(obj_id)

            # Subject node first, then object node
            endpoints = (
                (subj_key, subj_id, subj_tax_id),
                (obj_key, obj_id, obj_tax_id),
            )
            for node_key, node_id, tax_id in endpoints:
                yield (
                    arangodb.ortholog_nodes_name,
                    {
                        '_key': node_key,
                        'name': node_id,
                        'tax_id': tax_id,
                        'source': source,
                        'version': version,
                    },
                )

            edge_doc = {
                '_from': f"{arangodb.ortholog_nodes_name}/{subj_key}",
                '_to': f"{arangodb.ortholog_nodes_name}/{obj_key}",
                '_key': bel.utils._create_hash(f'{subj_id}>>{obj_id}'),
                'type': 'ortholog_to',
                'source': source,
                'version': version,
            }
            yield (arangodb.ortholog_edges_name, edge_doc)
def get_equivalents(term_key: str) -> Mapping[str, List[Mapping[str, Any]]]:
    """Get equivalents given term key

    The term is resolved with get_term(), its key is converted with
    arango_id_to_key(), and the equivalence graph is traversed breadth-first
    from that vertex (1 to 5 hops, any direction) over 'equivalence_edges'.

    Args:
        term_key: namespace:id - may be a primary, alt_key, or obsolete_key

    Returns:
        Mapping[str, List[Mapping[str, Any]]]: e.g. {"equivalents": [{'term_key': 'HGNC:5', 'namespace': 'HGNC', 'primary': False}]}
        When the term cannot be resolved, or any exception is raised, the
        mapping has an empty "equivalents" list plus an "errors" list of
        message strings.
    """

    try:
        term = get_term(term_key)
        term_dbkey = arango_id_to_key(term.key) if term else None

        if not term_dbkey:
            return {"equivalents": [], "errors": ["Unexpected error"]}

        # The start vertex is passed as a bind parameter rather than being
        # interpolated into the query text, so its value can never be parsed as AQL.
        query = """
        FOR vertex, edge IN 1..5
            ANY @start_vertex equivalence_edges
            OPTIONS {bfs: true, uniqueVertices : 'global'}
            RETURN DISTINCT {
                term_key: vertex.key,
                namespace: vertex.namespace,
                primary: vertex.primary
            }
        """
        bind_vars = {"start_vertex": f"equivalence_nodes/{term_dbkey}"}

        docs = list(resources_db.aql.execute(query, bind_vars=bind_vars))
        return {"equivalents": docs}

    except Exception as e:
        # Deliberate catch-all: callers receive an error payload instead of an exception
        logger.exception(f"Problem getting term equivalents for {term_key} msg: {e}")
        return {"equivalents": [], "errors": [f"Unexpected error {e}"]}
def orthologs_iterator(fo, version):
    """Generate ArangoDB node and edge documents for ortholog pairs.

    Reads a gzip-compressed file of JSON lines. A 'metadata' line provides
    the ``source`` recorded on later documents; each 'ortholog' line yields
    its subject node, its object node and finally the connecting edge.

    Args:
        fo: gzip-compressed file object (seek(0) is called before reading)
        version: stored in the "version" field of each yielded document

    Yields:
        tuples of (collection name, document dict)
    """

    def node_doc(db_key, node_id, tax_id, node_source):
        # One entry for the ortholog nodes collection
        return (
            arangodb.ortholog_nodes_name,
            {
                "_key": db_key,
                "name": node_id,
                "tax_id": tax_id,
                "source": node_source,
                "version": version,
            },
        )

    species_filter = config["bel_resources"].get("species_list", [])

    fo.seek(0)
    with gzip.open(fo, "rt") as handle:
        for text in handle:
            entry = json.loads(text)

            if "metadata" in entry:
                source = entry["metadata"]["source"]
                continue

            if "ortholog" in entry:
                ortholog = entry["ortholog"]
                subject_tax = ortholog["subject"]["tax_id"]
                object_tax = ortholog["object"]["tax_id"]

                # Skip if a known species is not listed in the species filter
                if species_filter:
                    if subject_tax and subject_tax not in species_filter:
                        continue
                    if object_tax and object_tax not in species_filter:
                        continue

                # _key values must only hold ArangoDB legal chars
                subject_id = ortholog["subject"]["id"]
                subject_db_key = arangodb.arango_id_to_key(subject_id)
                object_id = ortholog["object"]["id"]
                object_db_key = arangodb.arango_id_to_key(object_id)

                yield node_doc(subject_db_key, subject_id, subject_tax, source)
                yield node_doc(object_db_key, object_id, object_tax, source)

                link = {
                    "_from": f"{arangodb.ortholog_nodes_name}/{subject_db_key}",
                    "_to": f"{arangodb.ortholog_nodes_name}/{object_db_key}",
                    "_key": bel.utils._create_hash(f"{subject_id}>>{object_id}"),
                    "type": "ortholog_to",
                    "source": source,
                    "version": version,
                }
                yield (arangodb.ortholog_edges_name, link)
def terms_iterator_for_arangodb(fo, version):
    """Yield equivalence node and edge documents for namespace terms.

    Reads a gzip-compressed file of JSON lines. Lines without a "term" key
    (e.g. metadata records) are skipped. For each kept term this yields the
    primary ID node, then a node plus edge for every entry of "alt_ids",
    then a node plus edge for every entry of "equivalences".

    Args:
        fo: gzip-compressed file object; rewound to its start before reading
        version: value written to the "version" field of every document

    Yields:
        (collection name, document) tuples
    """

    allowed_species = config["bel_resources"].get("species_list", [])

    fo.seek(0)
    with gzip.open(fo, "rt") as stream:
        for raw_line in stream:
            record = json.loads(raw_line)

            # Only term records are of interest here
            if "term" not in record:
                continue
            term = record["term"]

            # Skip if species not listed in the species filter
            species_id = term.get("species_id", None)
            if allowed_species and species_id and species_id not in allowed_species:
                continue

            source = term["namespace"]
            term_id = term["id"]
            term_key = arangodb.arango_id_to_key(term_id)
            term_ns, _ = term_id.split(":", maxsplit=1)

            # Primary ID node
            primary_node = {
                "_key": term_key,
                "name": term_id,
                "primary": True,
                "namespace": term_ns,
                "source": source,
                "version": version,
            }
            yield (arangodb.equiv_nodes_name, primary_node)

            # Alt ID nodes/equivalences (to support other database equivalences
            # using non-preferred Namespace IDs); these carry the primary term's namespace
            for alt_id in term.get("alt_ids", ()):
                alt_key = arangodb.arango_id_to_key(alt_id)
                alt_node = {
                    "_key": alt_key,
                    "name": alt_id,
                    "namespace": term_ns,
                    "source": source,
                    "version": version,
                }
                yield (arangodb.equiv_nodes_name, alt_node)

                alt_edge = {
                    "_from": f"{arangodb.equiv_nodes_name}/{term_key}",
                    "_to": f"{arangodb.equiv_nodes_name}/{alt_key}",
                    "_key": bel.utils._create_hash(f"{term_id}>>{alt_id}"),
                    "type": "equivalent_to",
                    "source": source,
                    "version": version,
                }
                yield (arangodb.equiv_edges_name, alt_edge)

            # Cross-DB equivalences; the namespace is the prefix of each equivalence ID
            for eqv_id in term.get("equivalences", ()):
                eqv_ns, _ = eqv_id.split(":", maxsplit=1)
                eqv_key = arangodb.arango_id_to_key(eqv_id)
                eqv_node = {
                    "_key": eqv_key,
                    "name": eqv_id,
                    "namespace": eqv_ns,
                    "source": source,
                    "version": version,
                }
                yield (arangodb.equiv_nodes_name, eqv_node)

                eqv_edge = {
                    "_from": f"{arangodb.equiv_nodes_name}/{term_key}",
                    "_to": f"{arangodb.equiv_nodes_name}/{eqv_key}",
                    "_key": bel.utils._create_hash(f"{term_id}>>{eqv_id}"),
                    "type": "equivalent_to",
                    "source": source,
                    "version": version,
                }
                yield (arangodb.equiv_edges_name, eqv_edge)
def load_orthologs(fo: IO, metadata: dict, force: bool = False, resource_download_url: Optional[str] = None):
    """Load orthologs into ArangoDB

    The dataset is loaded only when ``force`` is set or its version differs
    from the version stored in the resource metadata collection. After a
    load, old entries for the source are removed and the metadata record is
    rewritten, unless the new load produced fewer orthologs than the
    previously recorded count, in which case the result is marked "Failed".

    Args:
        fo: file obj - orthologs file
        metadata: dict containing the metadata for orthologs; must hold
            "version" and "name". It is updated in place with "_key",
            "statistics" and optionally "resource_download_url".
        force: load even if this version is already recorded as loaded
        resource_download_url: stored on the metadata record when not None

    Returns:
        dict with "state" ("Succeeded" or "Failed") and a "messages" list
    """

    result = {"state": "Succeeded", "messages": []}
    statistics = {
        "entities_count": 0,
        "orthologous_pairs": defaultdict(lambda: defaultdict(int)),
    }

    version = metadata["version"]
    source = metadata["name"]

    # The metadata record is stored below under the converted key, so the prior
    # record has to be looked up under that same key (not the raw name) or it
    # is never found for names that the conversion alters.
    metadata_key = arangodb.arango_id_to_key(source)

    prior_metadata = resources_metadata_coll.get(metadata_key)
    try:
        prior_version = prior_metadata.get("version", "")
        prior_entity_count = prior_metadata["statistics"].get("entities_count", 0)
    except Exception:
        # No usable prior record - treat the dataset as never loaded
        prior_entity_count = 0
        prior_version = ""

    if not force and prior_version == version:
        msg = f"NOTE: This orthology dataset {source} at version {version} is already loaded and the 'force' option was not used"
        result["messages"].append(msg)
        return result

    # orthologs_iterator fills in `statistics` as a side effect while it is consumed
    arangodb.batch_load_docs(
        resources_db, orthologs_iterator(fo, version, statistics), on_duplicate="update"
    )

    logger.info(
        f"Loaded orthologs, source: {source} count: {statistics['entities_count']}",
        source=source,
    )

    if prior_entity_count > statistics["entities_count"]:
        msg = f"Error: This orthology dataset {source} at version {version} has fewer orthologs than previously loaded orthology dataset. Skipped removing old ortholog entries"
        logger.error(msg)
        result["state"] = "Failed"
        result["messages"].append(msg)
        return result

    remove_old_db_entries(source, version=version)

    # Add metadata to resource metadata collection
    metadata["_key"] = metadata_key

    # Using side effect to get statistics from orthologs_iterator on purpose
    metadata["statistics"] = copy.deepcopy(statistics)

    if resource_download_url is not None:
        metadata["resource_download_url"] = resource_download_url

    resources_metadata_coll.insert(metadata, overwrite=True)

    result["messages"].append(
        f'Loaded {statistics["entities_count"]} ortholog sets into arangodb'
    )
    return result
def orthologs_iterator(fo, version, statistics: Mapping):
    """Yield ortholog node and edge documents, one edge per ortholog pair.

    NOTE: ``statistics`` is mutated in place on purpose - the caller reads
    the counts from the same dict after the iterator has been consumed.

    Each line of ``fo`` is parsed as JSON. A "metadata" record sets the
    ``source`` used on later documents. An "ortholog" record yields the
    subject node, the object node and then the edge between them.

    Args:
        fo: file object of JSON lines; rewound to its start before reading
        version: value written to the "version" field of every document
        statistics: dict holding "entities_count" and the nested
            "orthologous_pairs" counters, both incremented per yielded edge

    Yields:
        (collection name, document) tuples
    """

    species_list = settings.BEL_FILTER_SPECIES

    fo.seek(0)
    for line in fo:
        record = json.loads(line)

        if "metadata" in record:
            source = record["metadata"]["name"]
            continue

        if "ortholog" not in record:
            continue

        ortholog = record["ortholog"]
        subject_key = ortholog["subject_key"]
        subject_species_key = ortholog["subject_species_key"]
        object_key = ortholog["object_key"]
        object_species_key = ortholog["object_species_key"]

        # Skip if any values are missing
        if not all((subject_key, subject_species_key, object_key, object_species_key)):
            continue

        # Skip unless both species are listed in species_list (when a filter is set)
        if species_list and not (
            subject_species_key in species_list and object_species_key in species_list
        ):
            continue

        # Simple lexical sorting (e.g. not numerical) to ensure 1 entry per pair
        first = (subject_key, subject_species_key)
        second = (object_key, object_species_key)
        if first[0] > second[0]:
            first, second = second, first
        subject_key, subject_species_key = first
        object_key, object_species_key = second

        # Convert to ArangoDB legal chars for arangodb _key
        subject_db_key = arangodb.arango_id_to_key(subject_key)
        object_db_key = arangodb.arango_id_to_key(object_key)

        # Subject node
        subject_node = {
            "_key": subject_db_key,
            "key": subject_key,
            "species_key": subject_species_key,
            "source": source,
            "version": version,
        }
        yield (ortholog_nodes_name, subject_node)

        # Object node
        object_node = {
            "_key": object_db_key,
            "key": object_key,
            "species_key": object_species_key,
            "source": source,
            "version": version,
        }
        yield (ortholog_nodes_name, object_node)

        arango_edge = {
            "_from": f"{ortholog_nodes_name}/{subject_db_key}",
            "_to": f"{ortholog_nodes_name}/{object_db_key}",
            "_key": bel.core.utils._create_hash(f"{subject_key}>>{object_key}"),
            "type": "ortholog_to",
            "source": source,
            "version": version,
        }

        # Counters are updated before the edge is handed to the consumer
        pair_counts = statistics["orthologous_pairs"]
        statistics["entities_count"] += 1
        pair_counts[subject_species_key][object_species_key] += 1
        pair_counts[object_species_key][subject_species_key] += 1

        yield (arangodb.ortholog_edges_name, arango_edge)
def terms_iterator_for_arangodb(f: IO, version: str):
    """Generate the documents needed to load namespace terms into arangodb.

    Each line of ``f`` is parsed as JSON; lines without a "term" key (e.g.
    metadata records) are skipped. For every kept term this yields, in order:
    the term record itself (with "_key" and "version" added to it), its
    primary equivalence node, and then a node plus edge for each entry of
    "alt_keys" followed by each entry of "equivalence_keys".

    Args:
        f: file object of JSON lines; rewound to its start before reading
        version: value written to the "version" field of every document

    Yields:
        (collection name, document) tuples
    """

    species_list = settings.BEL_FILTER_SPECIES

    f.seek(0)
    for line in f:
        record = json.loads(line)

        # skip if not term record (e.g. is a metadata record)
        if "term" not in record:
            continue

        term = record["term"]
        term_key = term["key"]
        namespace = term["namespace"]

        # Skip if species not listed in config species_list
        species_key = term.get("species_key", None)
        if species_list and species_key and species_key not in species_list:
            continue

        # Can't use original key formatted for Arangodb as some keys are longer than allowed (_key < 255 chars)
        term_db_key = arango_id_to_key(term_key)

        # The term record itself goes to the terms collection
        term["_key"] = term_db_key
        term["version"] = version
        yield (terms_coll_name, term)

        # Primary ID node
        primary_node = {
            "_key": term_db_key,
            "key": term["key"],  # BEL Key - ns:id
            "primary": True,
            "namespace": namespace,
            "source": namespace,
            "version": version,
        }
        yield (equiv_nodes_name, primary_node)

        # Alt keys (other database equivalences using non-preferred Namespace IDs)
        # and cross-namespace equivalences produce the same node/edge shape
        for field in ("alt_keys", "equivalence_keys"):
            if field not in term:
                continue

            for other_key in term[field]:
                other_db_key = arango_id_to_key(other_key)

                other_node = {
                    "_key": other_db_key,
                    "key": other_key,
                    "namespace": other_key.split(":", 1)[0],
                    "source": namespace,
                    "version": version,
                }
                yield (equiv_nodes_name, other_node)

                # Ensure only one edge per pair: the smaller db key is always the origin
                from_, to_ = (
                    (term_db_key, other_db_key)
                    if term_db_key < other_db_key
                    else (other_db_key, term_db_key)
                )

                other_edge = {
                    "_from": f"{equiv_nodes_name}/{from_}",
                    "_to": f"{equiv_nodes_name}/{to_}",
                    "_key": arango_id_to_key(f"{from_}>>{to_}"),
                    "type": "equivalent_to",
                    "source": namespace,
                    "version": version,
                }
                yield (equiv_edges_name, other_edge)
def terms_iterator_for_arangodb(fo, version):
    """Generate equivalence nodes and edges for the terms of a namespace file.

    The gzip-compressed input is read line by line as JSON. Records lacking
    a 'term' key (e.g. metadata records) are ignored. Each remaining term
    yields its primary ID node, followed by a node and an edge for every
    'alt_ids' entry and then for every 'equivalences' entry.

    Args:
        fo: gzip-compressed file object (seek(0) is called before reading)
        version: stored in the 'version' field of each yielded document

    Yields:
        tuples of (collection name, document dict)
    """

    def linked_docs(term_id, term_key, other_id, other_key, namespace, source):
        # Node for the related ID, then the edge from the primary term to it
        yield (
            arangodb.equiv_nodes_name,
            {
                '_key': other_key,
                'name': other_id,
                'namespace': namespace,
                'source': source,
                'version': version,
            },
        )
        yield (
            arangodb.equiv_edges_name,
            {
                '_from': f"{arangodb.equiv_nodes_name}/{term_key}",
                '_to': f"{arangodb.equiv_nodes_name}/{other_key}",
                '_key': bel.utils._create_hash(f'{term_id}>>{other_id}'),
                'type': 'equivalent_to',
                'source': source,
                'version': version,
            },
        )

    species_filter = config['bel_resources'].get('species_list', [])

    fo.seek(0)
    with gzip.open(fo, 'rt') as handle:
        for text in handle:
            entry = json.loads(text)

            # skip if not term record (e.g. is a metadata record)
            if 'term' not in entry:
                continue
            term = entry['term']

            # Skip if species not listed in the species filter
            species_id = term.get('species_id', None)
            if species_filter and species_id and species_id not in species_filter:
                continue

            source = term['namespace']
            term_id = term['id']
            term_key = arangodb.arango_id_to_key(term_id)
            primary_ns, _ = term_id.split(':', maxsplit=1)

            # Add primary ID node
            yield (
                arangodb.equiv_nodes_name,
                {
                    '_key': term_key,
                    'name': term_id,
                    'primary': True,
                    'namespace': primary_ns,
                    'source': source,
                    'version': version,
                },
            )

            # Create Alt ID nodes/equivalences (to support other database
            # equivalences using non-preferred Namespace IDs); the primary
            # term's namespace is recorded on these nodes
            if 'alt_ids' in term:
                for alt_id in term['alt_ids']:
                    alt_id_key = arangodb.arango_id_to_key(alt_id)
                    yield from linked_docs(
                        term_id, term_key, alt_id, alt_id_key, primary_ns, source
                    )

            # Cross-DB equivalences: namespace comes from the equivalence ID itself
            if 'equivalences' in term:
                for eqv in term['equivalences']:
                    eqv_ns, _ = eqv.split(':', maxsplit=1)
                    eqv_key = arangodb.arango_id_to_key(eqv)
                    yield from linked_docs(
                        term_id, term_key, eqv, eqv_key, eqv_ns, source
                    )