# Example 1
def load_orthologs(fo: IO, metadata: dict):
    """Load orthologs into ArangoDB.

    Batch-loads ortholog documents from *fo* into ArangoDB, removes stale
    edges/nodes left over from prior versions of the same source, and records
    the resource metadata in the belns metadata collection.

    Args:
        fo: file obj - orthologs file
        metadata: dict containing the metadata for orthologs

    Raises:
        SystemExit: if no ArangoDB client can be obtained
    """

    version = metadata["metadata"]["version"]
    source = metadata["metadata"]["source"]

    # LOAD ORTHOLOGS INTO ArangoDB
    with timy.Timer("Load Orthologs") as timer:

        arango_client = arangodb.get_client()
        if not arango_client:
            # sys.exit() raises SystemExit just like quit(), but does not
            # depend on the interactive-only `site` builtins being present
            import sys
            print("Cannot load orthologs without ArangoDB access")
            sys.exit()
        belns_db = arangodb.get_belns_handle(arango_client)
        arangodb.batch_load_docs(belns_db,
                                 orthologs_iterator(fo, version),
                                 on_duplicate="update")

        log.info("Load orthologs",
                 elapsed=timer.elapsed,
                 source=source)

        # Clean up old entries: any docs from this source whose version
        # differs from the one just loaded
        remove_old_ortholog_edges = f"""
            FOR edge in ortholog_edges
                FILTER edge.source == "{source}"
                FILTER edge.version != "{version}"
                REMOVE edge IN ortholog_edges
        """
        remove_old_ortholog_nodes = f"""
            FOR node in ortholog_nodes
                FILTER node.source == "{source}"
                FILTER node.version != "{version}"
                REMOVE node IN ortholog_nodes
        """
        arangodb.aql_query(belns_db, remove_old_ortholog_edges)
        arangodb.aql_query(belns_db, remove_old_ortholog_nodes)

    # Add metadata to resource metadata collection: insert first, fall back
    # to replace when the document already exists
    metadata["_key"] = f"Orthologs_{source}"
    try:
        belns_db.collection(arangodb.belns_metadata_name).insert(metadata)
    except ArangoError:
        belns_db.collection(arangodb.belns_metadata_name).replace(metadata)
# Example 2
def load_terms(fo: IO, metadata: dict, forceupdate: bool):
    """Load terms into Elasticsearch and ArangoDB.

    Forceupdate will create a new index in Elasticsearch regardless of whether
    an index with the resource version already exists.

    Args:
        fo: file obj - terminology file
        metadata: dict containing the metadata for terminology
        forceupdate: force full update - e.g. don't leave Elasticsearch indexes
            alone if their version ID matches

    Raises:
        SystemExit: if no ArangoDB client can be obtained
    """

    version = metadata["metadata"]["version"]
    namespace = metadata["metadata"]["namespace"]

    # LOAD TERMS INTO Elasticsearch
    with timy.Timer("Load Terms") as timer:
        es = bel.db.elasticsearch.get_client()

        # Version looks like an ISO timestamp; strip separators so it can be
        # embedded in an Elasticsearch index name
        es_version = version.replace("T", "").replace("-", "").replace(":", "")
        index_prefix = f"terms_{namespace.lower()}"
        index_name = f"{index_prefix}_{es_version}"

        # Create index with mapping
        if not elasticsearch.index_exists(es, index_name):
            elasticsearch.create_terms_index(es, index_name)
        elif forceupdate:  # force an update to the index
            index_name += "_alt"
            elasticsearch.create_terms_index(es, index_name)
        else:
            return  # Skip loading if not forced and not a new namespace

        terms_iterator = terms_iterator_for_elasticsearch(fo, index_name)
        elasticsearch.bulk_load_docs(es, terms_iterator)

        # Remove old namespace indexes: anything sharing this namespace's
        # prefix other than the index just loaded
        index_names = elasticsearch.get_all_index_names(es)
        for name in index_names:
            if name != index_name and index_prefix in name:
                elasticsearch.delete_index(es, name)

        # Add terms_alias to this index
        elasticsearch.add_index_alias(es, index_name, terms_alias)

        log.info(
            "Load namespace terms",
            elapsed=timer.elapsed,
            namespace=namespace,
        )

    # LOAD EQUIVALENCES INTO ArangoDB
    with timy.Timer("Load Term Equivalences") as timer:
        arango_client = arangodb.get_client()
        if not arango_client:
            # sys.exit() raises SystemExit just like quit(), without
            # depending on interactive-only `site` builtins
            import sys
            print("Cannot load terms without ArangoDB access")
            sys.exit()
        belns_db = arangodb.get_belns_handle(arango_client)

        # The Elasticsearch iterator above already consumed fo; rewind so the
        # ArangoDB iterator sees the terms again.
        # NOTE(review): assumes fo is seekable - confirm for gzip/stream inputs
        fo.seek(0)
        arangodb.batch_load_docs(belns_db,
                                 terms_iterator_for_arangodb(fo, version),
                                 on_duplicate="update")

        log.info(
            "Loaded namespace equivalences",
            elapsed=timer.elapsed,
            namespace=namespace,
        )

        # Clean up old entries: any docs from this namespace whose version
        # differs from the one just loaded
        remove_old_equivalence_edges = f"""
            FOR edge in equivalence_edges
                FILTER edge.source == "{namespace}"
                FILTER edge.version != "{version}"
                REMOVE edge IN equivalence_edges
        """
        remove_old_equivalence_nodes = f"""
            FOR node in equivalence_nodes
                FILTER node.source == "{namespace}"
                FILTER node.version != "{version}"
                REMOVE node IN equivalence_nodes
        """
        arangodb.aql_query(belns_db, remove_old_equivalence_edges)
        arangodb.aql_query(belns_db, remove_old_equivalence_nodes)

    # Add metadata to resource metadata collection: insert first, fall back
    # to replace when the document already exists
    metadata["_key"] = f"Namespace_{namespace}"
    try:
        belns_db.collection(arangodb.belns_metadata_name).insert(metadata)
    except ArangoError:
        belns_db.collection(arangodb.belns_metadata_name).replace(metadata)
# Example 3
def load_orthologs(fo: IO,
                   metadata: dict,
                   force: bool = False,
                   resource_download_url: Optional[str] = None):
    """Load orthologs into ArangoDB.

    Skips the load when this source/version is already present unless
    *force* is set. Refuses to delete old entries when the new dataset is
    smaller than the previously loaded one.

    Args:
        fo: file obj - orthologs file
        metadata: dict containing the metadata for orthologs
        force: reload even if this version is already loaded
        resource_download_url: optional URL recorded in the resource metadata

    Returns:
        dict with "state" ("Succeeded"/"Failed") and a list of "messages"
    """

    result = {"state": "Succeeded", "messages": []}

    statistics = {
        "entities_count": 0,
        "orthologous_pairs": defaultdict(lambda: defaultdict(int))
    }

    version = metadata["version"]
    source = metadata["name"]

    # Metadata documents for orthologs are keyed by the source name
    prior_metadata = resources_metadata_coll.get(source)

    try:
        prior_version = prior_metadata.get("version", "")
        prior_entity_count = prior_metadata["statistics"].get(
            "entities_count", 0)

    except Exception:
        # No prior load (or malformed prior metadata) - treat as fresh
        prior_entity_count = 0
        prior_version = ""

    if force or prior_version != version:
        arangodb.batch_load_docs(resources_db,
                                 orthologs_iterator(fo, version, statistics),
                                 on_duplicate="update")
    else:
        msg = f"NOTE: This orthology dataset {source} at version {version} is already loaded and the 'force' option was not used"
        result["messages"].append(msg)
        return result

    logger.info(
        f"Loaded orthologs, source: {source}  count: {statistics['entities_count']}",
        source=source)

    if prior_entity_count > statistics["entities_count"]:
        # Build the message once so the log line and the returned message
        # cannot drift apart
        msg = f"Error: This orthology dataset {source} at version {version} has fewer orthologs than previously loaded orthology dataset. Skipped removing old ortholog entries"
        logger.error(msg)

        result["state"] = "Failed"
        result["messages"].append(msg)
        return result

    remove_old_db_entries(source, version=version)

    # Add metadata to resource metadata collection
    metadata["_key"] = arangodb.arango_id_to_key(source)

    # Using side effect to get statistics from orthologs_iterator on purpose
    metadata["statistics"] = copy.deepcopy(statistics)

    if resource_download_url is not None:
        metadata["resource_download_url"] = resource_download_url

    resources_metadata_coll.insert(metadata, overwrite=True)

    result["messages"].append(
        f'Loaded {statistics["entities_count"]} ortholog sets into arangodb')
    return result
# Example 4
def load_terms(f: IO,
               metadata: dict,
               force: bool = False,
               resource_download_url: Optional[str] = None):
    """Load terms into Elasticsearch and ArangoDB.

    Force will create a new index in Elasticsearch regardless of whether
    an index with the resource version already exists.

    Args:
        f: file obj - terminology file
        metadata: dict containing the metadata for terminology
        force:  force full update - e.g. remove and re-add elasticsearch index
                and delete arangodb namespace records before loading
        resource_download_url: optional URL recorded in the resource metadata

    Returns:
        dict with "state" ("Succeeded"/"Warning"/"Failed") and a list of
        "messages"
    """

    result = {"state": "Succeeded", "messages": []}

    metadata["statistics"] = {
        "entities_count": 0,
        "synonyms_count": 0,
        "entity_types": defaultdict(int),
        "annotation_types": defaultdict(int),
        "equivalenced_namespaces": defaultdict(int),
    }

    metadata_key = f"Namespace_{metadata['namespace']}"
    prior_metadata = resources_metadata_coll.get(metadata_key)

    try:
        prior_version = prior_metadata.get("version", "")
        prior_entity_count = prior_metadata["statistics"].get(
            "entities_count", 0)
    except Exception:
        # No prior load (or malformed prior metadata) - treat as fresh
        prior_entity_count = 0
        prior_version = ""

    namespace = metadata["namespace"]
    version = metadata["version"]
    # Version looks like an ISO timestamp; strip separators so it can be
    # embedded in an Elasticsearch index name
    es_version = version.replace("T", "").replace("-", "").replace(":", "")
    index_prefix = f"{settings.TERMS_INDEX}_{namespace.lower()}"
    index_name = f"{index_prefix}_{es_version}"

    ################################################################################
    # Elasticsearch index processing
    ################################################################################
    # Create index with mapping
    if force or prior_version != version:
        elasticsearch.create_terms_index(index_name)
    else:
        result["state"] = "Succeeded"
        result["messages"].append(
            f'NOTE: This namespace {namespace} at version {version} is already loaded and the "force" option was not used'
        )

        return result

    # Using side effect to get statistics from terms_iterator_for_elasticsearch on purpose
    terms_iterator = terms_iterator_for_elasticsearch(f, index_name, metadata)
    elasticsearch.bulk_load_docs(terms_iterator)

    # Remove old namespace indexes: anything sharing this namespace's prefix
    # other than the index just loaded
    index_names = elasticsearch.get_all_index_names()
    for name in index_names:
        if name != index_name and index_prefix in name:
            elasticsearch.delete_index(name)

    # Build each message once so the log line and the returned message
    # cannot drift apart
    if not force and prior_entity_count > metadata["statistics"][
            "entities_count"]:
        msg = f'Problem loading namespace: {namespace}, previous entity count: {prior_entity_count}, current load entity count: {metadata["statistics"]["entities_count"]}'
        logger.error(msg)

        result["state"] = "Failed"
        result["messages"].append(f"ERROR: {msg}")
        return result

    elif force and prior_entity_count > metadata["statistics"][
            "entities_count"]:
        result["state"] = "Warning"
        result["messages"].append(
            f'WARNING: New namespace: {namespace} is smaller, previous entity count: {prior_entity_count}, current load entity count: {metadata["statistics"]["entities_count"]}'
        )

    # Add terms alias to this index
    elasticsearch.add_index_alias(index_name, settings.TERMS_INDEX)

    ################################################################################
    # Arangodb collection loading
    ################################################################################
    if force:
        remove_old_db_entries(namespace, version=version, force=True)

    # The Elasticsearch iterator above already consumed f; rewind so the
    # ArangoDB iterator sees the terms again.
    # NOTE(review): assumes f is seekable - confirm for gzip/stream inputs
    f.seek(0)

    # LOAD Terms and equivalences INTO ArangoDB
    # Uses update on duplicate to allow primary on equivalence_nodes to not be overwritten
    batch_load_docs(resources_db,
                    terms_iterator_for_arangodb(f, version),
                    on_duplicate="update")

    # Add metadata to resource metadata collection
    metadata["_key"] = metadata_key

    if resource_download_url is not None:
        metadata["resource_download_url"] = resource_download_url

    resources_metadata_coll.insert(metadata, overwrite=True)
    clear_resource_metadata_cache()

    if not force:
        remove_old_db_entries(namespace, version=version)

    loaded_msg = f'Loaded Namespace: {namespace} with {metadata["statistics"]["entities_count"]} terms into elasticsearch: {settings.TERMS_INDEX}.{index_name} and arangodb collection: {terms_coll_name}'
    logger.info(loaded_msg, namespace=metadata["namespace"])

    result["messages"].append(loaded_msg)
    return result