def sparql_query(
    request: Request,
    query: Optional[
        str] = "SELECT * WHERE { <https://identifiers.org/OMIM:246300> <https://w3id.org/biolink/vocab/treated_by> ?drug . }"
):
    """
    Send a SPARQL query to be executed. 
    - Example with a drug: https://identifiers.org/DRUGBANK:DB00394
    - Example with a disease: https://identifiers.org/OMIM:246300
    \f
    :param request: incoming request; only its ``accept`` header is read.
    :param query: SPARQL query input.
    :return: a placeholder service description when ``query`` is empty, a
        placeholder CSV response when the ``accept`` header is exactly
        ``text/csv``, a 501 JSON response for anything other than a SELECT
        query, otherwise the result of ``query_classifier_from_sparql``.
    """
    if not query:
        # TODO: return the SPARQL endpoint service description
        return {"SPARQL Service": "description"}

    # Use .get(): indexing the headers raises KeyError when the client sends
    # no Accept header at all, which would surface as a server error.
    if request.headers.get('accept') == 'text/csv':
        # TODO: return in CSV format
        return Response('a,b,c', media_type='text/csv')

    parsed_query = translateQuery(Query.parseString(query, parseAll=True))
    # Turn the CamelCase algebra name into words, e.g. "SelectQuery" ->
    # "Select Query".
    query_operation = re.sub(r"(\w)([A-Z])", r"\1 \2",
                             parsed_query.algebra.name)
    if query_operation != "Select Query":
        return JSONResponse(
            status_code=501,
            content={"message": str(query_operation) + " not implemented"})
    print(parsed_query)
    print(query_operation)
    predictions_list = query_classifier_from_sparql(parsed_query)
    return predictions_list
Exemple #2
0
def get_metadata(rq):
    '''
    Returns the metadata parsed from the raw query file 'rq'.

    Lines starting with '#+' are treated as YAML decorators; all other lines
    form the query text, stored under the 'query' key. The 'type' key holds
    the algebra name of the parsed query, or 'UNKNOWN' when the query cannot
    be parsed. For a 'SelectQuery' the projection variables are stored under
    'variables'.
    '''
    rows = rq.split('\n')
    yaml_string = "\n".join(row.lstrip('#+') for row in rows if row.startswith('#+'))
    query_string = "\n".join(row for row in rows if not row.startswith('#+'))

    # NOTE(security): the decorators come from the query file itself, so they
    # are loaded with safe_load. yaml.load() without an explicit Loader can
    # construct arbitrary objects and is rejected by recent PyYAML releases.
    query_metadata = yaml.safe_load(yaml_string)
    # No YAML decorators (None) or a YAML document that is not a mapping
    if not isinstance(query_metadata, dict):
        query_metadata = {}
    query_metadata['query'] = query_string

    try:
        parsed_query = translateQuery(Query.parseString(rq, parseAll=True))
    except ParseException:
        app.logger.error("Could not parse query")
        app.logger.error(query_string)
        app.logger.error(traceback.format_exc())
        # Without a parsed query there is no algebra to inspect; previously
        # execution fell through and failed on the unbound 'parsed_query'.
        query_metadata['type'] = 'UNKNOWN'
        return query_metadata

    query_metadata['type'] = parsed_query.algebra.name

    if query_metadata['type'] == 'SelectQuery':
        query_metadata['variables'] = parsed_query.algebra['PV']

    return query_metadata
Exemple #3
0
def get_metadata(rq, endpoint):
    '''
    Returns the metadata parsed from the raw query file 'rq'.

    The result starts from the YAML decorators returned by
    get_yaml_decorators(rq) and is completed with:
    - 'type': the algebra name of the parsed query, the name of the first
      update request for update queries, or 'UNKNOWN' when nothing parses;
    - 'variables': projection variables (SELECT queries only);
    - 'parameters': the query parameters (SELECT, CONSTRUCT and InsertData).

    'endpoint' is only forwarded to get_parameters().
    '''
    query_metadata = get_yaml_decorators(rq)
    query_metadata['type'] = 'UNKNOWN'

    try:
        # THE PARSING
        # select, describe, construct, ask
        parsed_query = translateQuery(Query.parseString(rq, parseAll=True))
        query_metadata['type'] = parsed_query.algebra.name
        if query_metadata['type'] in ('SelectQuery', 'ConstructQuery'):
            if query_metadata['type'] == 'SelectQuery':
                # Projection variables; set before get_parameters() is called
                # because it receives query_metadata.
                query_metadata['variables'] = parsed_query.algebra['PV']
            # Parameters
            query_metadata['parameters'] = get_parameters(rq, parsed_query.algebra['_vars'], endpoint, query_metadata)
        else:
            glogger.warning("Query type {} is currently unsupported and no metadata was parsed!".format(query_metadata['type']))
    except ParseException:
        glogger.warning("Could not parse regular SELECT, CONSTRUCT, DESCRIBE or ASK query")

        try:
            # update query
            glogger.info("Trying to parse UPDATE query")
            parsed_query = UpdateUnit.parseString(rq, parseAll=True)
            glogger.info(parsed_query)
            query_metadata['type'] = parsed_query[0]['request'][0].name
            if query_metadata['type'] == 'InsertData':
                query_metadata['parameters'] = {'g': {'datatype': None, 'enum': [], 'lang': None, 'name': 'g', 'original': '?_g_iri', 'required': True, 'type': 'iri'},
                                                'data': {'datatype': None, 'enum': [], 'lang': None, 'name': 'data', 'original': '?_data', 'required': True, 'type': 'literal'}}

            glogger.info("Update query parsed with {}".format(query_metadata['type']))
        except Exception:
            # Best effort: an unparseable query keeps whatever 'type' was set
            # so far. Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            glogger.error("Could not parse query")
            # .get(): the error handler must not fail if 'query' is absent
            glogger.error(query_metadata.get('query'))
            # format_exc() returns the traceback text; print_exc() returns None
            glogger.error(traceback.format_exc())

    glogger.debug("Finished parsing query of type {}".format(query_metadata['type']))
    glogger.debug("All parsed query metadata (from decorators and content): ")
    glogger.debug(pformat(query_metadata, indent=32))

    return query_metadata
Exemple #4
0
def get_parameters(rq):
    """
    Extract the API parameters declared as variables of the SPARQL query 'rq'.

    Variable naming conventions:
        ?_name The variable specifies the API mandatory parameter name. The value is incorporated in the query as plain literal.
        ?__name The parameter name is optional.
        ?_name_iri The variable is substituted with the parameter value as a IRI (also: number or literal).
        ?_name_en The parameter value is considered as literal with the language 'en' (e.g., en,it,es, etc.).
        ?_name_integer The parameter value is considered as literal and the XSD datatype 'integer' is added during substitution.
        ?_name_prefix_datatype The parameter value is considered as literal and the datatype 'prefix:datatype' is added during substitution. The prefix must be specified according to the SPARQL syntax.

    Returns a dict keyed by parameter name; each value is a dict with the
    keys 'original', 'required', 'name', 'type', 'datatype' and 'lang'.
    """

    variables = translateQuery(Query.parseString(rq, parseAll=True)).algebra['_vars']

    # Raw strings: "\d" in a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    ## Aggregates
    internal_matcher = re.compile(r"__agg_\d+__")
    ## Basil-style variables
    variable_matcher = re.compile(r"(?P<required>[_]{1,2})(?P<name>[^_]+)_?(?P<type>[a-zA-Z0-9]+)?_?(?P<userdefined>[a-zA-Z0-9]+)?.*$")

    parameters = {}
    for v in variables:
        # Variables generated for aggregates are not API parameters
        if internal_matcher.match(v):
            continue

        match = variable_matcher.match(v)
        if not match:
            continue

        vname = match.group('name')
        # A single leading underscore marks the parameter as required
        vrequired = match.group('required') == '_'
        vtype = 'iri'
        vlang = None
        vdatatype = None

        mtype = match.group('type')
        muserdefined = match.group('userdefined')

        if mtype in ['iri', 'number', 'literal']:
            vtype = mtype
        elif mtype:
            # Any other suffix makes the parameter a literal, optionally
            # qualified by an XSD datatype, a 2-letter language tag or a
            # user-defined 'prefix:datatype'.
            vtype = 'literal'
            if mtype in XSD_DATATYPES:
                vdatatype = 'xsd:{}'.format(mtype)
            elif len(mtype) == 2:
                vlang = mtype
            elif muserdefined:
                vdatatype = '{}:{}'.format(mtype, muserdefined)

        parameters[vname] = {
            'original': '?{}'.format(v),
            'required': vrequired,
            'name': vname,
            'type': vtype,
            'datatype': vdatatype,
            'lang': vlang
        }

    return parameters
Exemple #5
0
def get_metadata(rq):
    '''
    Returns the metadata parsed from the raw query file 'rq'.

    The result starts from the YAML decorators returned by
    get_yaml_decorators(rq) and is completed with:
    - 'type': the algebra name of the parsed query, the name of the first
      update request for update queries, or 'UNKNOWN' when nothing parses
      and the decorators did not provide one;
    - 'variables': projection variables (SELECT queries only).
    '''
    query_metadata = get_yaml_decorators(rq)
    # Guarantee a 'type' entry: the final log line reads it even when both
    # parsing attempts below fail.
    query_metadata.setdefault('type', 'UNKNOWN')

    try:
        # select, describe, construct, ask
        parsed_query = translateQuery(Query.parseString(rq, parseAll=True))
        query_metadata['type'] = parsed_query.algebra.name
        if query_metadata['type'] == 'SelectQuery':
            query_metadata['variables'] = parsed_query.algebra['PV']
    except ParseException:
        glogger.warning("Could not parse SELECT, DESCRIBE, CONSTRUCT, ASK query")

        try:
            # insert, update query
            glogger.info("Trying to parse update query")
            parsed_query = UpdateUnit.parseString(rq, parseAll=True)
            glogger.info(parsed_query)
            query_metadata['type'] = parsed_query[0]['request'][0].name
            glogger.info("Update query parsed with {}".format(query_metadata['type']))
        except Exception:
            # Best effort: keep going with the metadata gathered so far.
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            glogger.error("Could not parse UPDATE query")
            # .get(): the error handler must not fail if 'query' is absent
            glogger.error(query_metadata.get('query'))
            # format_exc() returns the traceback text; print_exc() returns None
            glogger.error(traceback.format_exc())

    glogger.info("Finished parsing query of type {}".format(query_metadata['type']))

    return query_metadata
Exemple #6
0
def _register_source_file(shapes_graph, file_uri, file_type, rdf_file_path,
                          repo_url):
    """Add the triples shared by every indexed file to ``shapes_graph``:
    its two RDF types, its file name as label and its code repository."""
    shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
    shapes_graph.add((file_uri, RDF.type, file_type))
    shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
    shapes_graph.add((file_uri, SCHEMA.codeRepository, URIRef(repo_url)))


def _last_object(g, subject, predicate, default):
    """Return the last object yielded for (subject, predicate) in ``g``,
    or ``default`` when there is none."""
    value = default
    for obj in g.objects(subject, predicate):
        value = obj
    return value


def _index_labelled_subjects(g, shapes_graph, file_uri, rdf_file_path,
                             repo_url, subject_type, file_type,
                             label_predicate):
    """Register the file as ``file_type`` for every subject of type
    ``subject_type`` in ``g`` and add each subject's label (or the subject
    itself when it has none) as a part of the file.

    Returns True when at least one subject was found.
    """
    found = False
    for subject in g.subjects(RDF.type, subject_type):
        found = True
        _register_source_file(shapes_graph, file_uri, file_type,
                              rdf_file_path, repo_url)
        part_label = _last_object(g, subject, label_predicate, subject)
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(part_label)))
    return found


def _collect_descriptions(g, subject, file_descriptions):
    """Append a title and the descriptions of ``subject`` to
    ``file_descriptions`` (modified in place)."""
    # Title: one rdfs:label, falling back to dc:title then dcterms:title,
    # and only when nothing has been collected yet.
    for label in g.objects(subject, RDFS.label):
        if len(file_descriptions) < 1:
            file_descriptions.append(str(label))
    if len(file_descriptions) == 0:
        for label in g.objects(subject, DC.title):
            file_descriptions.append(str(label))
    if len(file_descriptions) == 0:
        for label in g.objects(subject, DCTERMS.title):
            file_descriptions.append(str(label))
    # Descriptions (rdfs:comment is collected once; it used to be appended
    # twice, duplicating every comment in the final description)
    for comment in g.objects(subject, RDFS.comment):
        file_descriptions.append(str(comment))
    for description in g.objects(subject, DCTERMS.description):
        file_descriptions.append(str(description))


def process_shapes_file(shape_format, shapes_graph, rdf_file_path, repo_url,
                        branch, repo_description):
    """Process a file, check its content and add entry to the shapes graph.

    Handles all formats: OBO, OpenAPI, ShEx, SPARQL queries and RDF files
    (SHACL, CSVW, DCAT, nanopublication templates, RML/R2RML, OWL, SKOS and
    ShEx expressed in RDF).

    :param shape_format: 'obo', 'openapi', 'shex', 'sparql', or otherwise the
        RDF format name handed to the graph parser (e.g. 'xml', 'trig').
    :param shapes_graph: graph receiving the triples; modified in place.
    :param rdf_file_path: path of the local file; ``.name`` and
        ``.absolute()`` are used, so a pathlib-like object is expected.
    :param repo_url: URL of the repository containing the file.
    :param branch: branch name used to build the GitHub URL of the file.
    :param repo_description: optional description added as repository comment.
    :return: ``shapes_graph``.

    Problems met while reading OBO or RDF files are passed to
    ``add_to_report``; the other branches only log them.
    """
    # NOTE(review): the 12 characters stripped here are presumably the local
    # checkout directory prefix -- confirm against the caller.
    relative_filepath = str(rdf_file_path)[12:]
    github_file_url = generate_github_file_url(repo_url, relative_filepath,
                                               branch)
    file_uri = URIRef(github_file_url)
    shape_found = False
    g = Graph()

    if shape_format == 'obo':
        # Get OBO ontologies
        try:
            graph = obonet.read_obo(github_file_url)
            for id_, data in graph.nodes(data=True):
                shape_found = True
                # SIO_000623: OBO ontology
                _register_source_file(shapes_graph, file_uri,
                                      SIO['SIO_000623'], rdf_file_path,
                                      repo_url)
                shape_label = data.get('name') or id_
                shapes_graph.add(
                    (file_uri, DCTERMS.hasPart, Literal(shape_label)))
        except Exception as e:
            add_to_report('In repository: ' + repo_url + "\n> " + str(e),
                          github_file_url)

    # Index OpenAPI files
    elif shape_format == 'openapi':
        try:
            parser = ResolvingParser(github_file_url)
            shape_found = True
            _register_source_file(shapes_graph, file_uri,
                                  SCHEMA['APIReference'], rdf_file_path,
                                  repo_url)
            info = parser.specification['info']
            # .get(): a missing 'description' used to raise a KeyError that
            # was silently swallowed below, dropping the title comment and
            # the hasPart triple as well.
            file_descriptions = [
                text for text in (info.get('title'), info.get('description'))
                if text
            ]
            if file_descriptions:
                shapes_graph.add((file_uri, RDFS.comment,
                                  Literal(' - '.join(file_descriptions))))
            # TODO: get operations hasPart?
            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('OpenAPI')))
        except Exception as e:
            # Deliberately best-effort: files that cannot be processed as
            # OpenAPI are skipped without a report entry.
            # TODO: YARRML? Search for prefixes and mappings at the root of YAML
            logging.debug('Could not process file as OpenAPI: ' + str(e))

    # Search for shex files
    elif shape_format == 'shex':
        # No parsing possible for shex
        shape_found = True
        # TODO: use https://schema.org/SoftwareSourceCode ?
        _register_source_file(shapes_graph, file_uri, SHEX.Schema,
                              rdf_file_path, repo_url)
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('ShEx model')))

    # Parse SPARQL query files
    elif shape_format == 'sparql':
        # TODO: sparql+queries search failing might be due to a test SPARQL query hanging for long time
        shape_found = True
        _register_source_file(shapes_graph, file_uri, SH.SPARQLFunction,
                              rdf_file_path, repo_url)
        try:
            with open(rdf_file_path.absolute()) as file:
                sparql_query = file.read()
            # Some malformed queries use "=+" instead of "#+" for their
            # decorators: rewrite those rows and keep every other row as is.
            # (Rows not starting with "=+" used to be dropped, which emptied
            # the query of every well-formed file.)
            sparql_query = "\n".join(
                '#+' + row.lstrip('=+') if row.startswith('=+') else row
                for row in sparql_query.split('\n'))
            rows = sparql_query.split('\n')
            yaml_string = "\n".join(
                row.lstrip('#+') for row in rows if row.startswith('#+'))
            query_string = "\n".join(
                row for row in rows if not row.startswith('#+'))
            shapes_graph.add(
                (file_uri, SCHEMA['query'], Literal(sparql_query)))

            grlc_metadata = {}
            try:  # Invalid YAMLs will produce empty metadata
                grlc_metadata = yaml.load(yaml_string,
                                          Loader=yaml.FullLoader)
            except Exception:
                pass
            # Get grlc query metadata (only when the decorators form a
            # mapping; a scalar or list cannot carry the expected keys)
            if isinstance(grlc_metadata, dict) and grlc_metadata:
                file_descriptions = []
                if 'endpoint' in grlc_metadata:
                    sparql_endpoint = grlc_metadata['endpoint']
                    try:
                        shapes_graph.add((file_uri, VOID.sparqlEndpoint,
                                          URIRef(sparql_endpoint)))
                        test_sparql_endpoint(sparql_endpoint)
                    except Exception as e:
                        logging.debug(
                            'Issue parsing SPARQL endpoint from .rq file')
                        logging.debug(e)
                if grlc_metadata.get('summary'):
                    file_descriptions.append(grlc_metadata['summary'])
                if grlc_metadata.get('description'):
                    file_descriptions.append(grlc_metadata['description'])
                # Add the query description to the graph
                if file_descriptions:
                    shapes_graph.add(
                        (file_uri, RDFS.comment,
                         Literal(' - '.join(file_descriptions))))
                # If default params described for grlc SPARQL query we add them as shapes
                if 'defaults' in grlc_metadata:
                    for args in grlc_metadata['defaults']:
                        for arg, _ in args.items():
                            shapes_graph.add(
                                (file_uri, DCTERMS.hasPart, Literal(arg)))
            try:
                # Parse the query to get its operation (select, construct..)
                parsed_query = translateQuery(
                    Query.parseString(query_string, parseAll=True))
                query_operation = re.sub(r"(\w)([A-Z])", r"\1 \2",
                                         parsed_query.algebra.name)
                shapes_graph.add(
                    (file_uri, DCTERMS.hasPart, Literal(query_operation)))
            except Exception:
                # Unparseable queries get a generic part label
                shapes_graph.add(
                    (file_uri, DCTERMS.hasPart, Literal('SPARQL Query')))
        except Exception:
            logging.error('❌️ Issue opening file: ' + str(rdf_file_path))

    # Parse RDF files
    else:
        try:
            if shape_format == 'trig':
                # Different graph required for trig to work
                g = ConjunctiveGraph()
            g.parse(str(rdf_file_path.absolute()), format=shape_format)
        except Exception as e:
            report_message = ('RDF parsed as ' + shape_format +
                              ', in repository: ' + repo_url + "\n> " + str(e))
            if shape_format == 'xml' and str(rdf_file_path).endswith(
                    ('.owl', '.rdf')):
                # Try parsing with turtle for .owl and .rdf files
                try:
                    g.parse(str(rdf_file_path.absolute()), format='ttl')
                except Exception:
                    # Report the error of the first (XML) parsing attempt
                    add_to_report(report_message, github_file_url)
            else:
                add_to_report(report_message, github_file_url)

        # Search for SHACL shapes
        if _index_labelled_subjects(g, shapes_graph, file_uri, rdf_file_path,
                                    repo_url, SH.NodeShape, SH.Shape,
                                    RDFS.label):
            shape_found = True

        # Search for CSV on the Web RDF (csvw)
        # https://medium.swirrl.com/how-to-publish-csv-on-the-web-csvw-4ea6cbb603b4
        # https://www.w3.org/ns/csvw
        for shape_file in g.subjects(RDF.type, CSVW.Schema):
            shape_found = True
            _register_source_file(shapes_graph, file_uri, CSVW.Schema,
                                  rdf_file_path, repo_url)
            # Get columns label
            for col_label in g.objects(shape_file, CSVW.column):
                shapes_graph.add(
                    (file_uri, DCTERMS.hasPart, Literal(str(col_label))))

        # Search for DCAT Datasets
        for shape_file in g.subjects(RDF.type, DCAT.Dataset):
            shape_found = True
            _register_source_file(shapes_graph, file_uri, DCAT.Dataset,
                                  rdf_file_path, repo_url)
            # Use the first label of the dataset as file comment
            for file_label in g.objects(shape_file, RDFS.label):
                shapes_graph.add(
                    (file_uri, RDFS.comment, Literal(str(file_label))))
                break

        # Search for nanopublication templates
        nanopub_inputs = [
            NP_TEMPLATE.GuidedChoicePlaceholder,
            NP_TEMPLATE.LiteralPlaceholder,
            NP_TEMPLATE.RestrictedChoicePlaceholder,
            NP_TEMPLATE.UriPlaceholder
        ]
        for shape_file in g.subjects(RDF.type, NP_TEMPLATE.AssertionTemplate):
            shape_found = True
            _register_source_file(shapes_graph, file_uri,
                                  NP_TEMPLATE.AssertionTemplate,
                                  rdf_file_path, repo_url)
            # Use the first label of the template as file comment
            for template_label in g.objects(shape_file, RDFS.label):
                shapes_graph.add(
                    (file_uri, RDFS.comment, Literal(str(template_label))))
                break
            # TODO: get the shapes inside
            for np_input in nanopub_inputs:
                for shape in g.subjects(RDF.type, np_input):
                    shape_label = _last_object(g, shape, RDFS.label, shape)
                    shapes_graph.add(
                        (file_uri, DCTERMS.hasPart, Literal(shape_label)))

        # Search for RML and R2RML mappings
        for shape in g.subjects(RDF.type, R2RML.SubjectMap):
            shape_found = True
            # Differentiate RML and R2RML mappings
            if (None, RML.logicalSource, None) in g:
                mapping_type = RML.LogicalSource
            else:
                mapping_type = R2RML.TriplesMap
            _register_source_file(shapes_graph, file_uri, mapping_type,
                                  rdf_file_path, repo_url)
            # Label of the subjectMap: rdfs:label wins over the template,
            # which wins over the URI of the subjectMap itself
            shape_label = _last_object(g, shape, R2RML.template, shape)
            shape_label = _last_object(g, shape, RDFS.label, shape_label)
            shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))

        # Search for OWL classes
        if _index_labelled_subjects(g, shapes_graph, file_uri, rdf_file_path,
                                    repo_url, OWL.Class, OWL.Ontology,
                                    RDFS.label):
            shape_found = True

        # Get SKOS concepts
        if _index_labelled_subjects(g, shapes_graph, file_uri, rdf_file_path,
                                    repo_url, SKOS.Concept,
                                    SKOS.ConceptScheme, SKOS.prefLabel):
            shape_found = True

        # File description: built from owl:Ontology and skos:ConceptScheme
        # titles/descriptions and from shaclTest:Validate labels.
        file_descriptions = []
        for shape in g.subjects(RDF.type, OWL.Ontology):
            _collect_descriptions(g, shape, file_descriptions)
        for shape in g.subjects(
                RDF.type, URIRef('http://www.w3.org/ns/shacl-test#Validate')):
            for ontology_label in g.objects(shape, RDFS.label):
                file_descriptions.append(str(ontology_label))
        for shape in g.subjects(RDF.type, SKOS.ConceptScheme):
            _collect_descriptions(g, shape, file_descriptions)
        # Emitted only after every source has been collected: the concept
        # scheme descriptions used to be gathered after this triple was
        # added and were therefore never used.
        if file_descriptions:
            shapes_graph.add((file_uri, RDFS.comment,
                              Literal(' - '.join(file_descriptions))))

        # Search for ShEx Shapes and ShapeAnd
        # TODO: Improve
        for shex_type in (SHEX.ShapeAnd, SHEX.Shape):
            if _index_labelled_subjects(g, shapes_graph, file_uri,
                                        rdf_file_path, repo_url, shex_type,
                                        SHEX.Schema, RDFS.label):
                shape_found = True

    # Add the git repo to the graph
    if shape_found:
        logging.debug('[' + datetime.now().strftime("%m/%d/%Y, %H:%M:%S") +
                      '] ' + "✔️ Shape found in file " + github_file_url)
        repo_uri = URIRef(repo_url)
        shapes_graph.add((repo_uri, RDF.type, SCHEMA['DataCatalog']))
        shapes_graph.add(
            (repo_uri, RDFS.label, Literal(repo_url.rsplit('/', 1)[1])))
        if repo_description:
            shapes_graph.add(
                (repo_uri, RDFS.comment, Literal(repo_description)))

    return shapes_graph
Exemple #7
0
def get_parameters(rq, endpoint):
    """
    Extract the Basil-style API parameters declared as variables in the
    SPARQL query 'rq'.

    Variable naming conventions:
        ?_name The variable specifies the API mandatory parameter name. The value is incorporated in the query as plain literal.
        ?__name The parameter name is optional.
        ?_name_iri The variable is substituted with the parameter value as a IRI (also: number or literal).
        ?_name_en The parameter value is considered as literal with the language 'en' (e.g., en,it,es, etc.).
        ?_name_integer The parameter value is considered as literal and the XSD datatype 'integer' is added during substitution.
        ?_name_prefix_datatype The parameter value is considered as literal and the datatype 'prefix:datatype' is added during substitution. The prefix must be specified according to the SPARQL syntax.

    When a parameter variable can be located in a triple pattern of a query
    that has a FROM clause, a "SELECT DISTINCT" subquery is sent to 'endpoint'
    and the returned values become the parameter's 'enum' list.

    :param rq: raw text of the SPARQL query.
    :param endpoint: URL of the SPARQL endpoint used to enumerate values.
    :return: dict mapping each parameter name to a dict with the keys
        'original', 'required', 'name', 'enum', 'type', 'datatype', 'lang'.
    """

    variables = translateQuery(Query.parseString(
        rq, parseAll=True)).algebra['_vars']

    ## Aggregates
    internal_matcher = re.compile(r"__agg_\d+__")
    ## Basil-style variables
    variable_matcher = re.compile(
        r"(?P<required>[_]{1,2})(?P<name>[^_]+)_?(?P<type>[a-zA-Z0-9]+)?_?(?P<userdefined>[a-zA-Z0-9]+)?.*$"
    )

    parameters = {}
    for v in variables:
        if internal_matcher.match(v):
            continue

        match = variable_matcher.match(v)
        if not match:
            continue

        # Reset per variable: previously this was only assigned when the
        # triple pattern matched, so 'enum' either raised NameError or leaked
        # the codes of the preceding parameter.
        vcodes = []

        # TODO: currently only one parameter per triple pattern is supported
        tpattern_matcher = re.compile(
            r".*FROM\s+(?P<gnames>.*)\s+WHERE.*[\.\{][\n\t\s]*(?P<tpattern>.*\?"
            + re.escape(v) + r".*)\..*",
            flags=re.DOTALL)
        tp_match = tpattern_matcher.match(rq)
        if tp_match:
            vtpattern = tp_match.group('tpattern')
            gnames = tp_match.group('gnames')
            glogger.debug("Matched triple pattern with parameter")
            subquery = ("SELECT DISTINCT ?" + v + " FROM " + gnames +
                        " WHERE { " + vtpattern + " . }")
            # A callable replacement inserts the subquery verbatim; a plain
            # string would have its backslashes parsed as template escapes.
            codes_subquery = re.sub(r"SELECT.*\{.*\}",
                                    lambda _m: subquery,
                                    rq,
                                    flags=re.DOTALL)
            headers = {'Accept': 'application/json'}
            data = {'query': codes_subquery}
            # NOTE: urllib.urlencode / urllib2 are the Python 2 spellings;
            # kept as-is because the module's imports are outside this block.
            data_encoded = urllib.urlencode(data)
            req = urllib2.Request(endpoint, data_encoded, headers)
            glogger.debug("Sending code subquery request: " +
                          req.get_full_url() + "?" + req.get_data())
            response = urllib2.urlopen(req)
            codes_json = json.loads(response.read())
            for binding in codes_json['results']['bindings']:
                bound_values = list(binding.values())
                # Skip empty bindings instead of failing on them.
                if bound_values:
                    vcodes.append(bound_values[0]["value"])

        vname = match.group('name')
        # A single leading underscore marks the parameter as mandatory.
        vrequired = match.group('required') == '_'
        vtype = 'literal'
        vlang = None
        vdatatype = None

        mtype = match.group('type')
        muserdefined = match.group('userdefined')

        if mtype in ['iri', 'number', 'literal']:
            vtype = mtype
        elif mtype:
            # Any other suffix is a literal qualified by an XSD datatype, a
            # two-letter language tag, or a user-defined prefix:datatype.
            if mtype in static.XSD_DATATYPES:
                vdatatype = 'xsd:{}'.format(mtype)
            elif len(mtype) == 2:
                vlang = mtype
            elif muserdefined:
                vdatatype = '{}:{}'.format(mtype, muserdefined)

        parameters[vname] = {
            'original': '?{}'.format(v),
            'required': vrequired,
            'name': vname,
            'enum': sorted(vcodes),
            'type': vtype,
            'datatype': vdatatype,
            'lang': vlang
        }

    return parameters
Exemple #8
0
def _add_file_entry(shapes_graph, file_uri, rdf_file_path, repo_url,
                    file_type):
    """Describe the indexed file itself: its two types, label and source repo."""
    shapes_graph.add((file_uri, RDF.type, SCHEMA['SoftwareSourceCode']))
    shapes_graph.add((file_uri, RDF.type, file_type))
    shapes_graph.add((file_uri, RDFS.label, Literal(rdf_file_path.name)))
    shapes_graph.add((file_uri, DC.source, URIRef(repo_url)))


def _index_typed_subjects(g,
                          shapes_graph,
                          file_uri,
                          rdf_file_path,
                          repo_url,
                          subject_type,
                          file_type,
                          label_predicate=None,
                          limit=None):
    """Add a dcterms:hasPart entry for every subject of 'subject_type' in 'g'.

    The part is labelled with the last value of 'label_predicate'
    (rdfs:label when None) or, failing that, with the subject itself.
    At most 'limit' subjects are indexed when a limit is given.
    Returns True when at least one subject was found.
    """
    if label_predicate is None:
        label_predicate = RDFS.label
    found = False
    indexed_count = 0
    for shape in g.subjects(RDF.type, subject_type):
        found = True
        _add_file_entry(shapes_graph, file_uri, rdf_file_path, repo_url,
                        file_type)
        shape_label = shape
        for label in g.objects(shape, label_predicate):
            # Try to get the label of the shape
            shape_label = label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(shape_label)))
        indexed_count += 1
        if limit is not None and indexed_count >= limit:
            break
    return found


def _collect_descriptions(g, subject, file_descriptions):
    """Append the title and the description literals of 'subject' to the list."""
    # Get one of the labels, only when nothing has been collected yet
    for ontology_label in g.objects(subject, RDFS.label):
        if len(file_descriptions) < 1:
            file_descriptions.append(str(ontology_label))
    # Fall back on dc:title, then dcterms:title
    for title_predicate in (DC.title, DCTERMS.title):
        if len(file_descriptions) == 0:
            for title in g.objects(subject, title_predicate):
                file_descriptions.append(str(title))
    # Now add the description
    for description_predicate in (RDFS.comment, DC.description,
                                  DCTERMS.description):
        for description in g.objects(subject, description_predicate):
            file_descriptions.append(str(description))


def _index_obo_file(shapes_graph, file_uri, github_file_url, rdf_file_path,
                    repo_url):
    """Index the terms of an OBO ontology; returns True if any term was read."""
    found = False
    try:
        graph = obonet.read_obo(github_file_url)
        for id_, data in graph.nodes(data=True):
            found = True
            _add_file_entry(shapes_graph, file_uri, rdf_file_path, repo_url,
                            SIO['SIO_000623'])  # OBO ontology
            # Use the term name when available, its identifier otherwise
            shape_label = data.get('name') or id_
            shapes_graph.add(
                (file_uri, DCTERMS.hasPart, Literal(shape_label)))
    except Exception as e:
        add_to_report('File: ' + github_file_url + "\n\n" +
                      'In repository: ' + repo_url + "\n> " + str(e) +
                      "\n\n---\n")
    return found


def _index_openapi_file(shapes_graph, file_uri, github_file_url,
                        rdf_file_path, repo_url):
    """Index an OpenAPI specification; returns True if it could be parsed."""
    found = False
    try:
        parser = ResolvingParser(github_file_url)
        found = True
        _add_file_entry(shapes_graph, file_uri, rdf_file_path, repo_url,
                        SCHEMA['WebAPI'])
        # 'description' is optional in OpenAPI: read both fields leniently
        # so a missing key does not abort the indexing of the file.
        info = parser.specification.get('info') or {}
        file_descriptions = [
            info[key] for key in ('title', 'description') if info.get(key)
        ]
        if file_descriptions:
            shapes_graph.add((file_uri, DC.description,
                              Literal(' - '.join(file_descriptions))))
        # TODO: get operations hasPart?
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('OpenAPI')))
    except Exception:
        # Deliberately best-effort: files that are not valid OpenAPI
        # specifications are skipped without being reported.
        pass
    return found


def _register_sparql_endpoint(sparql_endpoint):
    """Probe a not yet seen SPARQL endpoint and record it as valid or failed."""
    # TODO: check if in hashes of already tested endpoints valid and failing
    # Then, like repos, add them as schema:EntryPoint
    if sparql_endpoint in VALID_ENDPOINTS or sparql_endpoint in FAILED_ENDPOINTS:
        return
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setReturnFormat(JSON)
    sparql.setQuery('SELECT * WHERE { ?s ?p ?o } LIMIT 10')
    try:
        results = sparql.query().convert()
        # Check SPARQL query sent back at least 5 triples
        if len(results["results"]["bindings"]) > 4:
            VALID_ENDPOINTS[sparql_endpoint] = {'label': sparql_endpoint}
        else:
            FAILED_ENDPOINTS[sparql_endpoint] = 'failed'
    except Exception as e:
        # Remember the failure so the endpoint is not probed (and reported)
        # again for every query file that references it.
        FAILED_ENDPOINTS[sparql_endpoint] = 'failed'
        add_to_report('SPARQL endpoint failed: ' + sparql_endpoint + "\n\n" +
                      str(e) + "\n\n---\n")


def _index_sparql_file(shapes_graph, file_uri, rdf_file_path, repo_url):
    """Index a SPARQL query file and its grlc-style '#+' YAML metadata."""
    _add_file_entry(shapes_graph, file_uri, rdf_file_path, repo_url,
                    SH.SPARQLFunction)
    with open(rdf_file_path.absolute()) as query_file:
        sparql_query = query_file.read()

    # Rows starting with '#+' hold the YAML metadata, the rest is the query
    rows = sparql_query.split('\n')
    yaml_string = "\n".join(
        row.lstrip('#+') for row in rows if row.startswith('#+'))
    query_string = "\n".join(row for row in rows if not row.startswith('#+'))
    shapes_graph.add((file_uri, SCHEMA['query'], Literal(query_string)))

    grlc_metadata = {}
    try:  # Invalid YAMLs will produce empty metadata
        # NOTE(security): this YAML comes from third-party repositories;
        # consider yaml.safe_load instead of FullLoader.
        grlc_metadata = yaml.load(yaml_string, Loader=yaml.FullLoader)
    except Exception:
        pass

    # Get metadata like grlc metadata. Anything that is not a mapping
    # (e.g. a bare YAML scalar) carries no usable metadata.
    if isinstance(grlc_metadata, dict) and grlc_metadata:
        if 'endpoint' in grlc_metadata:
            sparql_endpoint = grlc_metadata['endpoint']
            shapes_graph.add(
                (file_uri, VOID.sparqlEndpoint, Literal(sparql_endpoint)))
            _register_sparql_endpoint(sparql_endpoint)

        file_descriptions = [
            str(grlc_metadata[key]) for key in ('summary', 'description')
            if grlc_metadata.get(key)
        ]
        if file_descriptions:
            shapes_graph.add((file_uri, DC.description,
                              Literal(' - '.join(file_descriptions))))

        # If default params described for grlc SPARQL query we add then as shapes
        for args in grlc_metadata.get('defaults') or []:
            if not isinstance(args, dict):
                continue
            for arg in args:
                shapes_graph.add((file_uri, DCTERMS.hasPart, Literal(arg)))

    try:
        # Parse query to get its operation (select, construct..)
        parsed_query = translateQuery(
            Query.parseString(query_string, parseAll=True))
        query_operation = re.sub(r"(\w)([A-Z])", r"\1 \2",
                                 parsed_query.algebra.name)
        shapes_graph.add(
            (file_uri, DCTERMS.hasPart, Literal(query_operation)))
    except Exception:
        # Unparseable queries are still indexed, under a generic label
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('SPARQL Query')))
    return True


def _index_rdf_file(shape_format, shapes_graph, file_uri, github_file_url,
                    rdf_file_path, repo_url):
    """Parse an RDF file and index its SHACL shapes, OWL classes, SKOS concepts
    and ShEx shapes; returns True if any of them was found."""
    g = Graph()
    try:
        g.parse(str(rdf_file_path.absolute()), format=shape_format)
    except Exception as e:
        print('[' + datetime.now().strftime("%m/%d/%Y, %H:%M:%S") +
              '] 🗑 RDF parser for ' + shape_format +
              ' did not worked for the file ' + github_file_url)
        if not str(rdf_file_path).endswith(('.xml', '.json')):
            add_to_report('File: ' + github_file_url + " parsed as " +
                          shape_format + "\n\n" + 'In repository: ' +
                          repo_url + "\n> " + str(e) + "\n\n---\n")

    shape_found = False

    # Search for SHACL shapes
    if _index_typed_subjects(g, shapes_graph, file_uri, rdf_file_path,
                             repo_url, SH.NodeShape, SH.Shape):
        shape_found = True

    # Search for OWL classes, limit to max 300 classes/concepts retrieved
    classes_limit = 300
    if _index_typed_subjects(g, shapes_graph, file_uri, rdf_file_path,
                             repo_url, OWL.Class, OWL.Ontology,
                             limit=classes_limit):
        shape_found = True

    # Get rdfs:label of owl:Ontology and shaclTest:Validate for file description
    file_descriptions = []
    # owl:Ontology is capitalised; the lowercase 'ontology' never matched.
    for ontology in g.subjects(RDF.type, OWL.Ontology):
        _collect_descriptions(g, ontology, file_descriptions)
    for validation in g.subjects(
            RDF.type, URIRef('http://www.w3.org/ns/shacl-test#Validate')):
        for validation_label in g.objects(validation, RDFS.label):
            file_descriptions.append(str(validation_label))

    # Get SKOS concepts and concept scheme
    if _index_typed_subjects(g, shapes_graph, file_uri, rdf_file_path,
                             repo_url, SKOS.Concept, SKOS.ConceptScheme,
                             label_predicate=SKOS.prefLabel,
                             limit=classes_limit):
        shape_found = True
    for concept_scheme in g.subjects(RDF.type, SKOS.ConceptScheme):
        _collect_descriptions(g, concept_scheme, file_descriptions)

    # Written only once every source of description has been read, so the
    # concept scheme descriptions are no longer dropped.
    if file_descriptions:
        shapes_graph.add((file_uri, DC.description,
                          Literal(' - '.join(file_descriptions))))

    # Search for ShEx Shapes and ShapeAnd
    # TODO: Improve
    for shex_type in (SHEX.ShapeAnd, SHEX.Shape):
        if _index_typed_subjects(g, shapes_graph, file_uri, rdf_file_path,
                                 repo_url, shex_type, SHEX.Schema):
            shape_found = True

    return shape_found


def process_shapes_file(shape_format, shapes_graph, rdf_file_path, repo_url,
                        branch, repo_description):
    """Process a Shapes file, check its content and add entry to the shapes graph

    Dispatches on 'shape_format': 'obo', 'openapi', 'shex' and 'sparql' have
    dedicated handling, any other value is passed to the RDF parser as the
    serialization format.

    :param shape_format: 'obo', 'openapi', 'shex', 'sparql' or an RDF format name.
    :param shapes_graph: graph receiving the triples; modified in place.
    :param rdf_file_path: path object of the local file ('.name' and
        '.absolute()' are used).
    :param repo_url: URL of the repository the file comes from.
    :param branch: branch used to build the file URL.
    :param repo_description: optional description of the repository.
    :return: the same 'shapes_graph' object.
    """
    # Drops the first 12 characters of the path — presumably a fixed local
    # checkout prefix; TODO confirm against the caller.
    relative_filepath = str(rdf_file_path)[12:]
    github_file_url = generate_github_file_url(repo_url, relative_filepath,
                                               branch)
    file_uri = URIRef(github_file_url)

    if shape_format == 'obo':
        # Get OBO ontologies
        shape_found = _index_obo_file(shapes_graph, file_uri, github_file_url,
                                      rdf_file_path, repo_url)
    elif shape_format == 'openapi':
        # Index OpenAPI files
        shape_found = _index_openapi_file(shapes_graph, file_uri,
                                          github_file_url, rdf_file_path,
                                          repo_url)
    elif shape_format == 'shex':
        # no parsing possible for shex
        # TODO: use https://schema.org/SoftwareSourceCode ?
        shape_found = True
        _add_file_entry(shapes_graph, file_uri, rdf_file_path, repo_url,
                        SHEX.Schema)
        shapes_graph.add((file_uri, DCTERMS.hasPart, Literal('ShEx model')))
    elif shape_format == 'sparql':
        # Parse SPARQL query files
        shape_found = _index_sparql_file(shapes_graph, file_uri,
                                         rdf_file_path, repo_url)
    else:
        # Parse RDF files
        shape_found = _index_rdf_file(shape_format, shapes_graph, file_uri,
                                      github_file_url, rdf_file_path,
                                      repo_url)

    # Add repository RDF
    if shape_found:
        repo_uri = URIRef(repo_url)
        # TODO: change, schema:codeRepository is a property, not a class, but not much available..
        shapes_graph.add((repo_uri, RDF.type, SCHEMA['codeRepository']))
        # [-1] keeps working for a URL without any '/'
        shapes_graph.add(
            (repo_uri, RDFS.label, Literal(repo_url.rsplit('/', 1)[-1])))
        if repo_description:
            shapes_graph.add(
                (repo_uri, RDFS.comment, Literal(repo_description)))

    return shapes_graph