def get_entities_dataframe(
    spark_session: SparkSession,
    impress_api_url,
    impress_type: str,
    impress_ids: List[str],
) -> DataFrame:
    """
    Fetch the IMPReSS entities named by *impress_ids*, expand each one with
    its child collections, and attach unit metadata.

    :param spark_session: active Spark session used to build DataFrames
    :param impress_api_url: base URL of the IMPReSS REST API
    :param impress_type: entity type to request from the API
    :param impress_ids: identifiers of the entities to fetch
    :return: entity DataFrame left-outer joined with the units DataFrame
    """
    # Retrieve every requested entity from the API up front.
    fetched = [
        get_impress_entity_by_id(impress_api_url, impress_type, one_id)
        for one_id in impress_ids
    ]
    entity_df = spark_session.createDataFrame(map(convert_to_row, fetched))

    # Crawl the child collections, starting from an empty type label.
    entity_df = process_collection(spark_session, impress_api_url,
                                   entity_df.schema, "", entity_df)

    # Left-outer join keeps entities whose parameter has no matching unit.
    unit_df = get_impress_units(impress_api_url, spark_session)
    return entity_df.join(
        unit_df, entity_df["parameter.unit"] == unit_df["unitID"],
        "left_outer")
def extract_ontology_terms(spark_session: SparkSession,
                           ontologies_path: str) -> DataFrame:
    """
    Load every ``.owl`` ontology found in *ontologies_path* and collect the
    parsed terms into a Spark DataFrame.

    :param spark_session: active Spark session used to build the DataFrame
    :param ontologies_path: directory containing the ``.owl`` files
    :return: DataFrame with one row per parsed ontology term
    """
    ontology_terms = []
    # Iterate the directory as str; the original encoded it with
    # os.fsencode() and then mixed that bytes path with a str filename in
    # os.path.join(), which raises TypeError as soon as an .owl file exists.
    for filename in os.listdir(ontologies_path):
        if filename.endswith(".owl"):
            onto_path.append(os.path.join(ontologies_path, filename))
            # NOTE(review): get_ontology(None) presumably resolves the file
            # via the onto_path entry appended above — confirm against the
            # owlready2 API before relying on this.
            ontology = get_ontology(None).load()
            ontology_terms.extend(parse_ontology(ontology))
    ontology_terms_df = spark_session.createDataFrame(
        convert_to_row(term) for term in ontology_terms)
    return ontology_terms_df
# Beispiel #3
# 0
def get_entities_dataframe(spark_session: SparkSession, impress_api_url,
                           impress_type: str,
                           impress_ids: List[str]) -> DataFrame:
    """
    Build a DataFrame of the IMPReSS entities named by *impress_ids*,
    expanded with the data from their child collections.

    :param spark_session: active Spark session used to build DataFrames
    :param impress_api_url: base URL of the IMPReSS REST API
    :param impress_type: entity type to request from the API
    :param impress_ids: identifiers of the entities to fetch
    :return: DataFrame holding the expanded entities
    """
    # Retrieve every requested entity from the API.
    fetched = [
        get_impress_entity_by_id(impress_api_url, impress_type, one_id)
        for one_id in impress_ids
    ]
    entity_df = spark_session.createDataFrame(map(convert_to_row, fetched))

    # Crawl the child collections, starting from an empty type label.
    return process_collection(spark_session, impress_api_url,
                              entity_df.schema, '', entity_df)
# Beispiel #4
# 0
def get_entities_dataframe(
    spark_session: SparkSession,
    impress_api_url,
    impress_type: str,
    impress_ids: List[str],
    proxy_map: Dict,
) -> DataFrame:
    """
    Using a list of entity identifiers (e.g. pipeline IDs), return a
    DataFrame containing all the information about those entities.
    It also fires up the process to start crawling the collection belonging
    to each entity (e.g. the procedures contained on a pipeline) and joins
    the result with unit metadata.
    """
    # Retrieve every requested entity through the (possibly proxied) API.
    fetched = [
        get_impress_entity_by_id(impress_api_url, impress_type, one_id,
                                 proxy_map)
        for one_id in impress_ids
    ]
    entity_df = spark_session.createDataFrame(map(convert_to_row, fetched))

    # Extend each entity with the data coming from its child collections,
    # starting from an empty type label.
    entity_df = process_collection(
        spark_session,
        impress_api_url,
        entity_df.schema,
        "",
        entity_df,
        proxy_map,
    )

    # Left-outer join keeps entities whose parameter has no matching unit.
    unit_df = get_impress_units(impress_api_url, spark_session, proxy_map)
    return entity_df.join(
        unit_df, entity_df["parameter.unit"] == unit_df["unitID"],
        "left_outer")