Beispiel #1
0
def orthologs_iterator(fo, version):
    """Yield ortholog node and edge documents read from a gzipped JSON-lines file.

    The file object is rewound and read as gzip text, one JSON record per
    line.  A record containing a 'metadata' key sets the ``source`` value that
    is stamped on every document yielded afterwards.  A record containing an
    'ortholog' key produces three ``(collection_name, document)`` tuples, in
    this order: subject node, object node, edge between them.

    Args:
        fo: seekable file object holding the gzipped orthologs data
        version: version value stored on every yielded document

    Yields:
        tuple: (collection name, document dict)
    """

    allowed_species = config['bel_resources'].get('species_list', [])

    fo.seek(0)
    with gzip.open(fo, 'rt') as lines:
        for raw in lines:
            record = json.loads(raw)

            if 'metadata' in record:
                # Remember the source for the ortholog records that follow
                source = record['metadata']['source']
                continue

            if 'ortholog' not in record:
                continue

            ortholog = record['ortholog']
            subject = ortholog['subject']
            subj_tax_id = subject['tax_id']
            target = ortholog['object']
            obj_tax_id = target['tax_id']

            # An empty species list disables filtering; a missing tax_id is never filtered
            if allowed_species:
                if subj_tax_id and subj_tax_id not in allowed_species:
                    continue
                if obj_tax_id and obj_tax_id not in allowed_species:
                    continue

            # IDs are converted to ArangoDB legal chars for use as _key
            subj_id = subject['id']
            subj_key = arangodb.arango_id_to_key(subj_id)
            obj_id = target['id']
            obj_key = arangodb.arango_id_to_key(obj_id)

            subject_node = {
                '_key': subj_key,
                'name': subj_id,
                'tax_id': subj_tax_id,
                'source': source,
                'version': version
            }
            yield (arangodb.ortholog_nodes_name, subject_node)

            object_node = {
                '_key': obj_key,
                'name': obj_id,
                'tax_id': obj_tax_id,
                'source': source,
                'version': version
            }
            yield (arangodb.ortholog_nodes_name, object_node)

            yield (arangodb.ortholog_edges_name, {
                '_from': f"{arangodb.ortholog_nodes_name}/{subj_key}",
                '_to': f"{arangodb.ortholog_nodes_name}/{obj_key}",
                '_key': bel.utils._create_hash(f'{subj_id}>>{obj_id}'),
                'type': 'ortholog_to',
                'source': source,
                'version': version,
            })
Beispiel #2
0
def get_equivalents(term_key: str) -> Mapping[str, List[Mapping[str, Any]]]:
    """Get equivalents given term key

    Looks the term up with ``get_term`` and, when it is found, traverses
    ``equivalence_edges`` (depth 1 to 5, any direction, breadth-first, each
    vertex visited once) starting from the term's node in
    ``equivalence_nodes``.

    Args:
        term_key: namespace:id - may be a primary, alt_key, or obsolete_key

    Returns:
        Mapping[str, List[Mapping[str, Any]]]: e.g. {"equivalents": [{'term_key': 'HGNC:5', 'namespace': 'HGNC', 'primary': False}]}
        When the term cannot be resolved or any exception occurs, the result
        is {"equivalents": [], "errors": [<message>]} - exceptions are logged
        and never propagated to the caller.
    """

    try:
        term = get_term(term_key)
        term_dbkey = arango_id_to_key(term.key) if term else None

        if not term_dbkey:
            return {"equivalents": [], "errors": ["Unexpected error"]}

        # The start vertex is passed as a bind parameter instead of being
        # interpolated into the AQL text, so the key can never alter the query.
        query = """
        FOR vertex, edge IN 1..5
            ANY @start_vertex equivalence_edges
            OPTIONS {bfs: true, uniqueVertices : 'global'}
            RETURN DISTINCT {
                term_key: vertex.key,
                namespace: vertex.namespace,
                primary: vertex.primary
            }
        """
        bind_vars = {"start_vertex": f"equivalence_nodes/{term_dbkey}"}

        docs = list(resources_db.aql.execute(query, bind_vars=bind_vars))
        return {"equivalents": docs}

    except Exception as e:
        # Deliberate catch-all boundary: callers always get a well-formed mapping
        logger.exception(f"Problem getting term equivalents for {term_key} msg: {e}")
        return {"equivalents": [], "errors": [f"Unexpected error {e}"]}
Beispiel #3
0
def orthologs_iterator(fo, version):
    """Stream ortholog nodes and edges for loading into ArangoDB.

    ``fo`` is rewound, opened as gzip text and parsed line by line as JSON.
    Records with a "metadata" key only update the ``source`` recorded on the
    documents that follow.  Records with an "ortholog" key yield, in order,
    the subject node, the object node and the connecting edge, each as a
    ``(collection_name, document)`` tuple.

    Args:
        fo: seekable file object holding the gzipped orthologs data
        version: version value stored on every yielded document

    Yields:
        tuple: (collection name, document dict)
    """

    species_list = config["bel_resources"].get("species_list", [])

    def filtered_out(tax_id):
        """True when filtering is on and tax_id is set but not in species_list."""
        return species_list and tax_id and tax_id not in species_list

    def node_doc(db_key, name, tax_id, source):
        """Build the (collection, document) tuple for one ortholog node."""
        return (
            arangodb.ortholog_nodes_name,
            {
                "_key": db_key,
                "name": name,
                "tax_id": tax_id,
                "source": source,
                "version": version,
            },
        )

    fo.seek(0)
    with gzip.open(fo, "rt") as f:
        for line in f:
            record = json.loads(line)

            if "metadata" in record:
                source = record["metadata"]["source"]
                continue

            if "ortholog" not in record:
                continue

            ortholog = record["ortholog"]
            subj = ortholog["subject"]
            subj_tax_id = subj["tax_id"]
            obj = ortholog["object"]
            obj_tax_id = obj["tax_id"]

            # Skip pairs where either species is excluded by species_list
            if filtered_out(subj_tax_id) or filtered_out(obj_tax_id):
                continue

            # _key values must only contain ArangoDB legal chars
            subj_id = subj["id"]
            subj_key = arangodb.arango_id_to_key(subj_id)
            obj_id = obj["id"]
            obj_key = arangodb.arango_id_to_key(obj_id)

            yield node_doc(subj_key, subj_id, subj_tax_id, source)
            yield node_doc(obj_key, obj_id, obj_tax_id, source)

            edge_doc = {
                "_from": f"{arangodb.ortholog_nodes_name}/{subj_key}",
                "_to": f"{arangodb.ortholog_nodes_name}/{obj_key}",
                "_key": bel.utils._create_hash(f"{subj_id}>>{obj_id}"),
                "type": "ortholog_to",
                "source": source,
                "version": version,
            }
            yield (arangodb.ortholog_edges_name, edge_doc)
Beispiel #4
0
def terms_iterator_for_arangodb(fo, version):
    """Yield equivalence node and edge documents for namespace terms.

    ``fo`` is rewound, opened as gzip text and parsed line by line as JSON.
    Only records containing a "term" key are processed.  For each term this
    yields ``(collection_name, document)`` tuples: the primary ID node, then a
    node plus edge for every entry of "alt_ids", then a node plus edge for
    every entry of "equivalences".

    Args:
        fo: seekable file object holding the gzipped terms data
        version: version value stored on every yielded document

    Yields:
        tuple: (collection name, document dict)
    """

    species_list = config["bel_resources"].get("species_list", [])

    def node_doc(db_key, name, namespace, source):
        """Build a non-primary equivalence node tuple."""
        return (
            arangodb.equiv_nodes_name,
            {
                "_key": db_key,
                "name": name,
                "namespace": namespace,
                "source": source,
                "version": version,
            },
        )

    def edge_doc(term_id, term_key, other_id, other_key, source):
        """Build the equivalent_to edge tuple from the term to another ID."""
        return (
            arangodb.equiv_edges_name,
            {
                "_from": f"{arangodb.equiv_nodes_name}/{term_key}",
                "_to": f"{arangodb.equiv_nodes_name}/{other_key}",
                "_key": bel.utils._create_hash(f"{term_id}>>{other_id}"),
                "type": "equivalent_to",
                "source": source,
                "version": version,
            },
        )

    fo.seek(0)
    with gzip.open(fo, "rt") as f:
        for line in f:
            record = json.loads(line)
            # Anything without a term payload (e.g. a metadata record) is ignored
            if "term" not in record:
                continue
            term = record["term"]

            # Terms without a species_id are never filtered out
            species_id = term.get("species_id", None)
            if species_list and species_id and species_id not in species_list:
                continue

            source = term["namespace"]
            term_id = term["id"]
            term_key = arangodb.arango_id_to_key(term_id)

            term_ns, _ = term_id.split(":", maxsplit=1)

            # Primary ID node
            primary_node = {
                "_key": term_key,
                "name": term_id,
                "primary": True,
                "namespace": term_ns,
                "source": source,
                "version": version,
            }
            yield (arangodb.equiv_nodes_name, primary_node)

            # Alt IDs (non-preferred IDs) are recorded under the primary term's namespace prefix
            for alt_id in term.get("alt_ids", ()):
                alt_id_key = arangodb.arango_id_to_key(alt_id)
                yield node_doc(alt_id_key, alt_id, term_ns, source)
                yield edge_doc(term_id, term_key, alt_id, alt_id_key, source)

            # Cross-DB equivalences carry the namespace prefix of their own ID
            for eqv in term.get("equivalences", ()):
                eqv_ns, _ = eqv.split(":", maxsplit=1)
                eqv_key = arangodb.arango_id_to_key(eqv)
                yield node_doc(eqv_key, eqv, eqv_ns, source)
                yield edge_doc(term_id, term_key, eqv, eqv_key, source)
Beispiel #5
0
def load_orthologs(fo: IO,
                   metadata: dict,
                   force: bool = False,
                   resource_download_url: Optional[str] = None):
    """Load orthologs into ArangoDB

    The dataset is loaded when ``force`` is set or when its version differs
    from the version stored in the resources metadata collection.  After a
    load, old entries are removed and the metadata record is overwritten -
    unless the new load has fewer entities than the previous one, in which
    case the result is marked "Failed" and old entries are kept.

    Args:
        fo: file obj - orthologs file
        metadata: dict containing the metadata for orthologs; "version" and
            "name" are read, and "_key", "statistics" and (optionally)
            "resource_download_url" are added to it in place
        force: load even if this version is already recorded as loaded
        resource_download_url: stored in the metadata record when not None

    Returns:
        dict: {"state": "Succeeded" | "Failed", "messages": [str, ...]}
    """

    result = {"state": "Succeeded", "messages": []}

    # Filled in by orthologs_iterator while batch_load_docs consumes it
    statistics = {
        "entities_count": 0,
        "orthologous_pairs": defaultdict(lambda: defaultdict(int))
    }

    version = metadata["version"]
    source = metadata["name"]

    prior_metadata = resources_metadata_coll.get(metadata["name"])

    # Any problem reading the prior record is treated as "nothing loaded before"
    try:
        prior = (
            prior_metadata.get("version", ""),
            prior_metadata["statistics"].get("entities_count", 0),
        )
    except Exception:
        prior = ("", 0)
    prior_version, prior_entity_count = prior

    if not (force or prior_version != version):
        result["messages"].append(
            f"NOTE: This orthology dataset {source} at version {version} is already loaded and the 'force' option was not used"
        )
        return result

    arangodb.batch_load_docs(resources_db,
                             orthologs_iterator(fo, version, statistics),
                             on_duplicate="update")

    logger.info(
        f"Loaded orthologs, source: {source}  count: {statistics['entities_count']}",
        source=source)

    if prior_entity_count > statistics["entities_count"]:
        msg = f"Error: This orthology dataset {source} at version {version} has fewer orthologs than previously loaded orthology dataset. Skipped removing old ortholog entries"
        logger.error(msg)

        result["state"] = "Failed"
        result["messages"].append(msg)
        return result

    remove_old_db_entries(source, version=version)

    # Record this load in the resource metadata collection
    metadata["_key"] = arangodb.arango_id_to_key(source)

    # Statistics were populated as a side effect of orthologs_iterator, on purpose
    metadata["statistics"] = copy.deepcopy(statistics)

    if resource_download_url is not None:
        metadata["resource_download_url"] = resource_download_url

    resources_metadata_coll.insert(metadata, overwrite=True)

    loaded_msg = f'Loaded {statistics["entities_count"]} ortholog sets into arangodb'
    result["messages"].append(loaded_msg)
    return result
Beispiel #6
0
def orthologs_iterator(fo, version, statistics: Mapping):
    """Ortholog node and edge iterator

    Reads ``fo`` from the start, one JSON record per line.  A "metadata"
    record sets the ``source`` used on later documents.  Each usable
    "ortholog" record yields the subject node, the object node and the edge
    between them as ``(collection_name, document)`` tuples.

    NOTE: the statistics dict works as a side effect since it is passed as a reference!!!
    ``statistics["entities_count"]`` and both directions of
    ``statistics["orthologous_pairs"]`` are incremented once per yielded edge.

    Args:
        fo: seekable file object yielding JSON lines
        version: version value stored on every yielded document
        statistics: mapping updated in place (see NOTE above)

    Yields:
        tuple: (collection name, document dict)
    """

    species_list = settings.BEL_FILTER_SPECIES

    fo.seek(0)

    for line in fo:
        record = json.loads(line)

        if "metadata" in record:
            source = record["metadata"]["name"]
            continue

        if "ortholog" not in record:
            continue

        ortholog = record["ortholog"]

        # (key, species_key) pairs for each side of the ortholog relation
        subject = (ortholog["subject_key"], ortholog["subject_species_key"])
        target = (ortholog["object_key"], ortholog["object_species_key"])

        # Skip if any values are missing
        if not all(subject + target):
            continue

        # Skip unless both species are listed (an empty species_list disables the filter)
        if species_list and not (subject[1] in species_list
                                 and target[1] in species_list):
            continue

        # Simple lexical sorting (e.g. not numerical) on the keys to ensure 1 entry per pair
        if subject[0] > target[0]:
            subject, target = target, subject

        subject_key, subject_species_key = subject
        object_key, object_species_key = target

        # Convert to ArangoDB legal chars for arangodb _key
        subject_db_key = arangodb.arango_id_to_key(subject_key)
        object_db_key = arangodb.arango_id_to_key(object_key)

        subject_node = {
            "_key": subject_db_key,
            "key": subject_key,
            "species_key": subject_species_key,
            "source": source,
            "version": version,
        }
        yield (ortholog_nodes_name, subject_node)

        object_node = {
            "_key": object_db_key,
            "key": object_key,
            "species_key": object_species_key,
            "source": source,
            "version": version,
        }
        yield (ortholog_nodes_name, object_node)

        edge_hash = bel.core.utils._create_hash(f"{subject_key}>>{object_key}")
        arango_edge = {
            "_from": f"{ortholog_nodes_name}/{subject_db_key}",
            "_to": f"{ortholog_nodes_name}/{object_db_key}",
            "_key": edge_hash,
            "type": "ortholog_to",
            "source": source,
            "version": version,
        }

        # Counted just before the edge is handed out; pairs are tallied in both directions
        statistics["entities_count"] += 1
        pair_counts = statistics["orthologous_pairs"]
        pair_counts[subject_species_key][object_species_key] += 1
        pair_counts[object_species_key][subject_species_key] += 1

        yield (arangodb.ortholog_edges_name, arango_edge)
Beispiel #7
0
def terms_iterator_for_arangodb(f: IO, version: str):
    """Generator for loading namespace terms into arangodb

    Reads ``f`` from the start, one JSON record per line, and processes only
    records containing a "term" key.  For each term it yields
    ``(collection_name, document)`` tuples in this order: the term record
    itself (with "_key" and "version" added in place), the primary
    equivalence node, then a node plus edge for every "alt_keys" entry and
    every "equivalence_keys" entry.

    Args:
        f: seekable file object yielding JSON lines
        version: version value stored on every yielded document

    Yields:
        tuple: (collection name, document dict)
    """

    species_list = settings.BEL_FILTER_SPECIES

    def equivalence_node(key, db_key, source):
        """Non-primary equivalence node; namespace is the prefix of its own key."""
        return (
            equiv_nodes_name,
            {
                "_key": db_key,
                "key": key,
                "namespace": key.split(":", 1)[0],
                "source": source,
                "version": version,
            },
        )

    def equivalence_edge(term_db_key, other_db_key, source):
        """Edge between two nodes, ordered by db key to ensure only one edge per pair."""
        if term_db_key < other_db_key:
            from_, to_ = term_db_key, other_db_key
        else:
            from_, to_ = other_db_key, term_db_key

        return (
            equiv_edges_name,
            {
                "_from": f"{equiv_nodes_name}/{from_}",
                "_to": f"{equiv_nodes_name}/{to_}",
                "_key": arango_id_to_key(f"{from_}>>{to_}"),
                "type": "equivalent_to",
                "source": source,
                "version": version,
            },
        )

    f.seek(0)

    for line in f:
        record = json.loads(line)
        # Anything without a term payload (e.g. a metadata record) is ignored
        if "term" not in record:
            continue

        term = record["term"]
        term_key = term["key"]
        namespace = term["namespace"]

        # Skip if species not listed in the species filter; terms without a species_key always pass
        species_key = term.get("species_key", None)
        if species_list and species_key and species_key not in species_list:
            continue

        # Can't use original key formatted for Arangodb as some keys are longer than allowed (_key < 255 chars)
        term_db_key = arango_id_to_key(term_key)

        # The term record is annotated in place and stored in the terms collection
        term["_key"] = term_db_key
        term["version"] = version
        yield (terms_coll_name, term)

        # Primary ID node
        primary_node = {
            "_key": term_db_key,
            "key": term["key"],  # BEL Key - ns:id
            "primary": True,
            "namespace": namespace,
            "source": namespace,
            "version": version,
        }
        yield (equiv_nodes_name, primary_node)

        # Alt ID nodes/equivalences (to support other database equivalences using non-preferred Namespace IDs)
        for alt_key in term.get("alt_keys", ()):
            alt_db_key = arango_id_to_key(alt_key)
            yield equivalence_node(alt_key, alt_db_key, namespace)
            yield equivalence_edge(term_db_key, alt_db_key, namespace)

        # Cross-Namespace equivalences
        for eqv_key in term.get("equivalence_keys", ()):
            eqv_db_key = arango_id_to_key(eqv_key)
            yield equivalence_node(eqv_key, eqv_db_key, namespace)
            yield equivalence_edge(term_db_key, eqv_db_key, namespace)
Beispiel #8
0
def terms_iterator_for_arangodb(fo, version):
    """Generate equivalence nodes and edges for the terms in a gzipped JSON-lines file.

    The file object is rewound and read as gzip text.  Records without a
    'term' key are skipped.  Every remaining term yields its primary ID node
    followed by a node and an 'equivalent_to' edge for each of its 'alt_ids'
    and each of its 'equivalences', all as ``(collection_name, document)``
    tuples.

    Args:
        fo: seekable file object holding the gzipped terms data
        version: version value stored on every yielded document

    Yields:
        tuple: (collection name, document dict)
    """

    species_list = config['bel_resources'].get('species_list', [])

    def linked(term_id, term_key, source, other_id, other_ns):
        """Yield the node for other_id, then the edge from the term to it."""
        other_key = arangodb.arango_id_to_key(other_id)

        yield (arangodb.equiv_nodes_name, {
            '_key': other_key,
            'name': other_id,
            'namespace': other_ns,
            'source': source,
            'version': version
        })

        yield (arangodb.equiv_edges_name, {
            '_from': f"{arangodb.equiv_nodes_name}/{term_key}",
            '_to': f"{arangodb.equiv_nodes_name}/{other_key}",
            '_key': bel.utils._create_hash(f'{term_id}>>{other_id}'),
            'type': 'equivalent_to',
            'source': source,
            'version': version,
        })

    fo.seek(0)
    with gzip.open(fo, 'rt') as lines:
        for raw in lines:
            record = json.loads(raw)
            # Anything without a term payload (e.g. a metadata record) is ignored
            if 'term' not in record:
                continue
            term = record['term']

            # Filter on species only when both a species list and a species_id are present
            species_id = term.get('species_id', None)
            if species_list and species_id and species_id not in species_list:
                continue

            source = term['namespace']
            term_id = term['id']
            term_key = arangodb.arango_id_to_key(term_id)

            term_ns, _ = term_id.split(':', maxsplit=1)

            # Primary ID node
            primary = {
                '_key': term_key,
                'name': term_id,
                'primary': True,
                'namespace': term_ns,
                'source': source,
                'version': version
            }
            yield (arangodb.equiv_nodes_name, primary)

            # Alt IDs (non-preferred IDs) are recorded under the primary term's namespace prefix
            for alt_id in term.get('alt_ids', ()):
                yield from linked(term_id, term_key, source, alt_id, term_ns)

            # Cross-DB equivalences carry the namespace prefix of their own ID
            for eqv in term.get('equivalences', ()):
                eqv_ns, _ = eqv.split(':', maxsplit=1)
                yield from linked(term_id, term_key, source, eqv, eqv_ns)