Esempio n. 1
0
    def __call__(self, url):
        """
        Retrieve the JSON-LD document at the given URL.

        Only absolute "http"/"https" URLs whose network location is made of
        ASCII letters, digits, '-', '.' and ':' are dereferenced. When the
        fetched document parses as JSON and any element of
        ``list_of(data['@context'])`` is a dict, the URL is added to
        ``self.contexts``.

        :param url: the URL to retrieve.

        :return: the RemoteDocument: a dict with ``contextUrl``,
            ``documentUrl`` and ``document`` (the raw response text), plus
            ``from_cache`` when ``self.use_cache`` is truthy.

        :raises JsonLdError: if the URL is rejected or retrieval fails.
        """
        try:
            # validate URLs
            pieces = urlparse(url)
            allowed_netloc_chars = set(
                string.ascii_letters + string.digits + '-.:')
            # The netloc must be a *subset* of the allowed characters. The
            # former strict-superset test (`>`) could only be true when the
            # netloc contained every allowed character, so it never fired.
            if (not all([pieces.scheme, pieces.netloc])
                    or pieces.scheme not in ['http', 'https']
                    or not set(pieces.netloc) <= allowed_netloc_chars):
                # The message is ONE argument built by implicit string
                # concatenation; a comma between the fragments shifted every
                # following positional argument and clashed with `code=`.
                raise JsonLdError(
                    'Could not dereference URL; can only load URLs using '
                    'the "http" and "https" schemes.',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')

            response = self.session.get(
                url,
                headers={'Accept': 'application/ld+json, application/json'})

            doc = {
                'contextUrl': None,
                'documentUrl': url,
                'document': response.text
            }

            if self.use_cache:
                doc['from_cache'] = response.from_cache
                self.session.remove_expired_responses()

            # Save URL for Potential Extension contexts.
            try:
                data = json.loads(response.text)
                context = data['@context']
                if any(isinstance(el, dict) for el in list_of(context)):
                    self.contexts.update([url])
            except Exception:
                # Best effort only: a body that is not JSON or has no
                # '@context' must not prevent returning the document.
                pass

            return doc

        except JsonLdError:
            # Already the right error type; re-raise with its traceback.
            raise
        except Exception as cause:
            raise JsonLdError('Could not retrieve JSON-LD document from URL.',
                              'jsonld.LoadDocumentError',
                              code='loading document failed',
                              cause=cause)
Esempio n. 2
0
def jsonld_document_loader(url):
    """
    Retrieves JSON-LD at the given URL.

    Only absolute "http"/"https" URLs whose network location is made of
    ASCII letters, digits, '-', '.' and ':' are dereferenced.

    :param url: the URL to retrieve.

    :return: the RemoteDocument: a dict with ``contextUrl`` (always None),
        ``documentUrl`` (the requested URL) and ``document`` (the raw
        response text).

    :raises JsonLdError: if the URL is rejected or retrieval fails.
    """
    try:
        # validate URL
        pieces = urlparse(url)
        allowed_netloc_chars = set(
            string.ascii_letters + string.digits + '-.:')
        # The netloc must be a *subset* of the allowed characters. The
        # former strict-superset test (`>`) could only be true when the
        # netloc contained every allowed character, so it never fired.
        if (not all([pieces.scheme, pieces.netloc]) or
                pieces.scheme not in ['http', 'https'] or
                not set(pieces.netloc) <= allowed_netloc_chars):
            raise JsonLdError(
                'URL could not be dereferenced; only "http" and "https" '
                'URLs are supported.',
                'jsonld.InvalidUrl', {'url': url},
                code='loading document failed')

        response = requests.get(
            url, headers={'Accept': 'application/ld+json, application/json'}
        )

        return {
            'contextUrl': None,
            'documentUrl': url,
            'document': response.text
        }
    except JsonLdError:
        # Already the right error type; re-raise with its traceback.
        raise
    except Exception as cause:
        raise JsonLdError(
            'Could not retrieve a JSON-LD document from the URL.',
            'jsonld.LoadDocumentError', code='loading document failed',
            cause=cause)
Esempio n. 3
0
    def loader(url, prev_url, options=None):
        """
        Retrieves JSON-LD from a URL, a file location or as text

        Input that starts with '[' or '{' (after stripping whitespace) or
        that contains a newline is parsed as inline JSON text. A reference
        with neither scheme nor network location is resolved against
        ``prev_url['url']`` when one is recorded; any other reference is
        stored there. "file" URLs are read from disk; everything else is
        delegated to ``requests_document_loader`` (built with the ``kwargs``
        of the enclosing scope).

        :param url: the URL to retrieve.
        :param prev_url: Dictionary to carry the previous URL referenced
        :param options: Additional options

        :return: the RemoteDocument.

        :raises JsonLdError: if a "file" URL cannot be read or parsed.
        """
        if options is None:
            options = {}

        # Process text being passed in as the document. The emptiness guard
        # keeps an empty/whitespace-only reference from raising IndexError;
        # such input falls through to normal URL handling instead.
        text = url.strip()
        if text and (text[0] in '[{' or '\n' in url):
            return {
                'contentType': 'text/plain',
                'contextUrl': None,
                'documentUrl': None,
                'document': json.loads(url)
            }

        # process relative URL
        pieces = urlparse(url)
        if not any([pieces.scheme, pieces.netloc]):
            if prev_url['url']:
                url = urljoin(prev_url['url'], url)
                pieces = urlparse(url)
        else:
            prev_url['url'] = url

        # check for file access
        if pieces.scheme == 'file':
            # Imported locally so the block does not depend on a new
            # module-level import.
            from urllib.request import url2pathname
            try:
                # The URL path is percent-encoded (and drive-prefixed on
                # Windows); convert it to a real filesystem path first.
                # JSON text is UTF-8, independent of the locale encoding.
                with open(url2pathname(pieces.path), encoding='utf-8') as f:
                    doc = f.read()
                return {
                    'contentType': 'text/plain',
                    'contextUrl': None,
                    'documentUrl': url,
                    'document': json.loads(doc)
                }
            except Exception as cause:
                raise JsonLdError(
                    f'Could not retrieve a JSON-LD document from {url}.',
                    'jsonld.LoadDocumentError',
                    code='loading document failed',
                    cause=cause)
        else:
            return requests_document_loader(**kwargs)(url, options)
Esempio n. 4
0
    async def async_loader(client, url):
        """
        Retrieve a JSON-LD document through the IPFS client.

        ``ipschema://<name>/<path>`` URLs are resolved by looking up
        ``<name>`` in the client's key list and taking ``<path>`` under the
        matching key's IPNS path; any other URL is treated as an IPFS path.

        :param client: IPFS client providing ``key.list()`` and ``cat()``.
        :param url: the URL to retrieve.

        :return: the RemoteDocument.

        :raises JsonLdError: if the URL cannot be resolved to a valid path
            or the document cannot be fetched or parsed.
        :raises aioipfs.APIError: re-raised as-is from the IPFS client.
        """
        try:
            if url.startswith('ipschema://'):
                o = urlparse(url)
                kList = await client.key.list()

                ipnsKey = None
                for key in kList['Keys']:
                    if key['Name'] == o.netloc:
                        ipnsKey = key['Id']

                if ipnsKey is None:
                    path = None
                else:
                    path = IPFSPath(joinIpns(ipnsKey)).child(o.path)
            else:
                path = IPFSPath(url)
                if not path.valid:
                    raise Exception('Not a valid path')

            if not (path and path.valid):
                # Without this the function fell off the end and returned
                # None instead of a RemoteDocument.
                raise JsonLdError(
                    'Could not resolve an IPFS path for the URL.',
                    'jsonld.LoadDocumentError', {'url': url},
                    code='loading document failed')

            data = await client.cat(path.objPath)

            return {
                'document': json.loads(data.decode()),
                'documentUrl': url,
                'contextUrl': None
            }

        except aioipfs.APIError as e:
            log.debug(str(e))
            raise
        except JsonLdError as e:
            log.debug(str(e))
            raise
        except Exception as cause:
            raise JsonLdError(
                'Could not retrieve a JSON-LD document from the URL.',
                'jsonld.LoadDocumentError', code='loading document failed',
                cause=cause)
Esempio n. 5
0
    async def async_loader(client, url, options=None):
        """
        Retrieve a JSON-LD document through the IPFS client.

        For ``ipschema://`` / ``ips://`` URLs the network location names a
        schema namespace that ``ldSchemas.nsToIpfs`` maps to an IPFS path; a
        URL with an empty or root path yields an empty document without any
        lookup. Any other URL is treated as an IPFS path.

        :param client: IPFS client providing ``cat()``.
        :param url: the URL to retrieve.
        :param options: accepted for loader-interface compatibility; unused.

        :return: the RemoteDocument.

        :raises JsonLdError: if the URL cannot be resolved to a valid path
            or the document cannot be parsed.
        :raises asyncio.TimeoutError: if fetching takes longer than 10s.
        :raises aioipfs.APIError: re-raised as-is from the IPFS client.
        """
        try:
            o = urlparse(url)
            if o.scheme in ['ipschema', 'ips']:
                ipsKey = o.netloc

                if ipsKey == 'galacteek.ld.contexts':
                    # Compat
                    ipsKey = 'galacteek.ld'

                if not o.path or o.path == '/':
                    # Namespace root: answer with an empty document.
                    return {
                        'contentType': 'application/ld+json',
                        'document': {},
                        'documentUrl': url,
                        'contextUrl': None
                    }

                sIpfsPath = await ldSchemas.nsToIpfs(ipsKey)
                path = None if sIpfsPath is None else sIpfsPath.child(o.path)
            else:
                path = IPFSPath(url)
                if not path.valid:
                    raise Exception(f'Not a valid path: {url}')

            if not (path and path.valid):
                # Without this the function fell off the end and returned
                # None instead of a RemoteDocument.
                raise JsonLdError(
                    'Could not resolve an IPFS path for the URL.',
                    'jsonld.LoadDocumentError', {'url': url},
                    code='loading document failed')

            data = await asyncio.wait_for(client.cat(path.objPath), 10)

            obj = orjson.loads(data.decode())
            if obj is None:
                # Explicit check: an `assert` is stripped under `python -O`.
                # The generic handler below wraps this as a JsonLdError.
                raise ValueError(f'JSON-LD document is null: {url}')

            return {
                'contentType': 'application/ld+json',
                'document': obj,
                'documentUrl': url,
                'contextUrl': None
            }
        except asyncio.TimeoutError as terr:
            log.debug(f'Timeout error while loading context: {terr}')
            raise
        except aioipfs.APIError as e:
            log.debug(f'IPFS error while loading context: {e}')
            raise
        except JsonLdError as e:
            log.debug(str(e))
            raise
        except Exception as cause:
            raise JsonLdError(
                'Could not retrieve a JSON-LD document from the URL.',
                'jsonld.LoadDocumentError',
                code='loading document failed',
                cause=cause)
Esempio n. 6
0
    def loader(url):
        """
        Retrieves JSON-LD at the given URL.

        Only absolute "http"/"https" URLs (only "https" when ``secure`` is
        set in the enclosing scope) whose network location is made of ASCII
        letters, digits, '-', '.' and ':' are dereferenced.

        :param url: the URL to retrieve.

        :return: the RemoteDocument: a dict with ``contextUrl``,
            ``documentUrl`` (the final response URL) and ``document`` (the
            parsed JSON body).

        :raises JsonLdError: if the URL is rejected, retrieval or JSON
            parsing fails, or more than one context Link header is present.
        """
        try:
            # validate URL
            pieces = urllib_parse.urlparse(url)
            allowed_netloc_chars = set(
                string.ascii_letters + string.digits + "-.:")
            # The netloc must be a *subset* of the allowed characters. The
            # former strict-superset test (`>`) could only be true when the
            # netloc contained every allowed character, so it never fired.
            if (not all([pieces.scheme, pieces.netloc])
                    or pieces.scheme not in ["http", "https"]
                    or not set(pieces.netloc) <= allowed_netloc_chars):
                raise JsonLdError(
                    'URL could not be dereferenced; only "http" and "https" '
                    "URLs are supported.",
                    "jsonld.InvalidUrl",
                    {"url": url},
                    code="loading document failed",
                )
            if secure and pieces.scheme != "https":
                raise JsonLdError(
                    "URL could not be dereferenced; secure mode enabled and "
                    'the URL\'s scheme is not "https".',
                    "jsonld.InvalidUrl",
                    {"url": url},
                    code="loading document failed",
                )
            headers = {"Accept": "application/ld+json, application/json"}
            response = session.get(url, headers=headers, **kwargs)

            doc = {
                "contextUrl": None,
                "documentUrl": response.url,
                "document": response.json(),
            }
            content_type = response.headers.get("content-type")
            # Compare the bare media type only, so a header such as
            # "application/ld+json; charset=utf-8" still counts as JSON-LD.
            media_type = (content_type or "").split(";", 1)[0].strip().lower()
            link_header = response.headers.get("link")
            if link_header and media_type != "application/ld+json":
                linked_context = parse_link_header(link_header).get(
                    LINK_HEADER_REL)
                # only 1 related link header permitted
                if isinstance(linked_context, list):
                    raise JsonLdError(
                        "URL could not be dereferenced, it has more than one "
                        "associated HTTP Link Header.",
                        "jsonld.LoadDocumentError",
                        {"url": url},
                        code="multiple context link headers",
                    )
                if linked_context:
                    doc["contextUrl"] = linked_context["target"]
            return doc
        except JsonLdError:
            # Already the right error type; re-raise with its traceback.
            raise
        except Exception as cause:
            raise JsonLdError(
                "Could not retrieve a JSON-LD document from the URL.",
                "jsonld.LoadDocumentError",
                code="loading document failed",
                cause=cause,
            )
Esempio n. 7
0
def test_page(ark):
    """
    Build graph elements (nodes and edges) from the JSON-LD of ``ark``.

    Fetches JSON-LD from ``EG_URL + ark`` using the "fairscapeAuth" cookie
    (or, when absent, the "Authorization" request header) as the token,
    flattens it, turns every flattened entry into a node and every
    ``{'@id': ...}`` reference into an edge, merges the labels of edges that
    share the same source/target pair, and stores the result in the
    module-level ``elements`` list before rendering a template.

    :param ark: identifier appended to ``EG_URL`` to locate the JSON-LD.

    :return: the ``'error'`` value of the fetched payload when present,
        otherwise the rendered 'local_index.html' (when the LOCAL
        environment variable is set) or 'index.html' template.

    :raises JsonLdError: if flattening the JSON-LD fails.
    """
    # NOTE(review): the result is published through this global; it is not
    # passed to render_template() below - presumably read elsewhere, confirm.
    global elements
    elements = []

    if request.cookies.get("fairscapeAuth") is None:
        token = request.headers.get("Authorization")
    else:
        token = request.cookies.get("fairscapeAuth")

    data_jsonld = requests.get(EG_URL + ark, headers={
        "Authorization": token
    }).json()

    if 'error' in data_jsonld.keys():
        return data_jsonld['error']
    # try:
    try:
        data_jsonld_flat = jsonld.flatten(data_jsonld)  #
    except Exception as cause:
        # NOTE(review): `cause` is passed as the second positional argument;
        # other JsonLdError calls use `cause=` as a keyword - confirm intent.
        raise JsonLdError('Error flattening JSON-LD content ', cause)

    # print("\nflattened JSON-LD content\n", json.dumps(data_jsonld_flat, indent=2))

    elements = []  # contains nodes and edges
    nodes = []  # vertices
    edges = []  # links between vertices
    id_position = {}  # mapping of each @id to a number
    counter = 0

    # TODO: to check if http://schema.org/name missing
    # NOTE(review): the key read below is 'https://schema.org/name' (https).
    for level in data_jsonld_flat:
        if level.get('@id') is None or '_:b' in level[
                '@id']:  # flattening generates a blank node _:b when @id is missing
            print('Error: found blank node for missing @id at: ', level)
            # NOTE(review): sys.exit() raises SystemExit rather than
            # returning an error to the caller - confirm this is intended.
            sys.exit()
        # ['@type'][0] is simply the string '@type'.
        if level.get(['@type'][0]) is None:
            print('Error: missing @type at: ', level)
            sys.exit()
        nodes_data = {}
        nodes_element = {}
        nodes_element['id'] = counter
        nodes_element['@id'] = level['@id']
        if os.environ.get("LOCAL", False):
            # NOTE(review): the next line is not valid Python as it stands;
            # the literal looks mangled (possibly by credential redaction of
            # a 'http://host:port/' + level['@id'] expression) - restore it
            # from the original source.
            nodes_element['href'] = 'http://*****:*****@id']  # href in cytoscape to open as a URI
        else:
            nodes_element['href'] = MDS_URL + level[
                '@id']  # href in cytoscape to open as a URI
        nodes_element['@type'] = level['@type'][0]
        nodes_element['type'] = level['@type'][
            0]  # @type cannot be retrieved as node(@type)
        nodes_element['name'] = level['https://schema.org/name'][0]['@value']
        nodes_element['info'] = 'Name: ' + level['https://schema.org/name'][0]['@value'] + '\nType: ' + level['@type'][0] \
                                + '\nPID: ' + level['@id']  # all attributes together
        nodes_data['data'] = nodes_element
        nodes.append(nodes_data)
        id_position[level['@id']] = counter
        counter += 1

    # print('\nNodes\n', json.dumps(nodes, indent=2))

    # populate edges
    for item in data_jsonld_flat:
        source_id = item[
            '@id']  # chooses @id as source at each level for an edge
        for key, value in item.items(
        ):  # iterates through each flattened level
            if isinstance(value, list):
                for i in value:
                    if isinstance(i, dict):
                        if '@id' in i.keys():
                            edges_data = {}
                            edges_element = {}
                            edges_element['source'] = id_position[source_id]
                            # Raises KeyError when the referenced @id is not
                            # one of the flattened entries.
                            edges_element['target'] = id_position[i['@id']]
                            edges_element['label'] = key
                            edges_data['data'] = edges_element
                            edges.append(edges_data)

    # print('\nEdges\n', json.dumps(edges, indent=2))

    # copies all nodes and edges inside elements
    # (this list is discarded again where `elements` is reinitialized below)
    elements = nodes.copy()
    for element in edges:
        elements.append(element)

    # Convert multiple edges such that e1(v1, v2), e2(v1, v2), e3(v1, v2) =>
    # Multiples edges between v1, v2 such as http://schema.org/founder, http://schema.org/member become [founder, member]
    source = []
    target = []
    label = []
    for edge_data in edges:
        for edge in edge_data.values():
            for key, value in edge.items():
                if key == 'source':
                    source.append(value)
                if key == 'target':
                    target.append(value)
                if key == 'label':
                    label.append(value)

    d = {'source': source, 'target': target, 'label': label}
    df = pd.DataFrame(data=d)

    # print('\nAll Edges\n', df)

    # rows whose (source, target) pair occurs more than once
    df_edge_has_common_nodes = df[df.duplicated(subset=['source', 'target'],
                                                keep=False)]
    # print('\nEdges with common nodes\n', df_edge_has_common_nodes)

    # rows whose (source, target) pair occurs exactly once
    df_unique = df.drop_duplicates(subset=['source', 'target'], keep=False)
    # print('\nUnique Edges\n', df_unique)

    # one row per repeated pair, labels joined with ','
    df_merged_edge_has_common_nodes = df_edge_has_common_nodes.groupby(['source', 'target'], as_index=False) \
        .agg({'label': ','.join})

    # print('\nMerged unique & non-unique edges\n', df_merged_edge_has_common_nodes)

    uri_prefix_suffix_dict_list = [
    ]  # Maps uri prefix to its suffix e.g. {http://schema.org/ : member"

    # populate common edge labels within [...] e.g. [founder, member, ...]
    def get_property_labels(labels):
        # Returns the last '/'-separated segment of a single label, or
        # '[a, b, ...]' for a ','-joined group of labels; each prefix/suffix
        # pair is also appended to uri_prefix_suffix_dict_list.
        property_list = str(labels).split(',')
        if len(property_list) == 1:
            uri_prefix_suffix_dict = {}
            suffix = property_list[0].split('/')[-1]
            # NOTE(review): str.replace removes every occurrence of `suffix`,
            # not only the trailing one - confirm the prefix is as intended.
            prefix = property_list[0].replace(suffix, '')
            uri_prefix_suffix_dict[prefix] = suffix
            uri_prefix_suffix_dict_list.append(uri_prefix_suffix_dict)
            return suffix
        elif len(property_list) > 1:
            property_list_size = len(property_list)
            # prop_list = []  # sending as a list does not add [] around the labels
            props_list = '['  # this string adds the anticipated []
            for prop in property_list:
                uri_prefix_suffix_dict = {}
                suffix = prop.split('/')[-1]
                prefix = prop.replace(suffix, '')
                uri_prefix_suffix_dict[prefix] = suffix
                uri_prefix_suffix_dict_list.append(uri_prefix_suffix_dict)
                # prop_list.append(suffix)
                property_list_size -= 1
                if property_list_size > 0:
                    props_list += suffix + ', '
                else:
                    props_list += suffix
            # return prop_list
            return props_list + ']'

    elements = []  # reinitialize empty nodes and edges
    # Populate only unique edges which are not shared between two nodes
    for index, row in df_unique.iterrows():
        edge_data = {}
        edges_element = {}
        edges_element['source'] = row['source']
        edges_element['target'] = row['target']
        property_label = get_property_labels(row['label'])
        if property_label is None:
            print('ERROR: Could not find property label!')
            sys.exit()
        edges_element['label'] = property_label
        edge_data['data'] = edges_element
        elements.append(edge_data)

    # Populate only those edges which are shared between two vertices
    for index, row in df_merged_edge_has_common_nodes.iterrows():
        edge_data = {}
        edges_element = {}
        edges_element['source'] = row['source']
        edges_element['target'] = row['target']
        property_labels = get_property_labels(row['label'])
        if property_labels is None:
            print('ERROR: Could not find property labels!')
            sys.exit()
        edges_element['label'] = property_labels
        edge_data['data'] = edges_element
        elements.append(edge_data)

    # print('\nRefined Edges\n', elements)

    # Adding the nodes
    def is_node_in_edges(node, edges):
        # True when the node's numeric id is the source or target of any
        # edge; the id set is rebuilt on every call.
        edge_nodes = set()
        for edge_data_value in edges:
            edge_nodes.add(edge_data_value['data']['source'])
            edge_nodes.add(edge_data_value['data']['target'])
        if node['data']['id'] in edge_nodes:
            return True
        else:
            return False

    # Only nodes that take part in at least one edge are kept.
    for node in nodes:
        if is_node_in_edges(node, edges):
            elements.append(node)
    # except:
    #     return "Visual Failed. Probably missing type, name, or ID"
    print('Made it to render.')
    if os.environ.get("LOCAL", False):
        return render_template('local_index.html')
    return render_template('index.html')
Esempio n. 8
0
    def loader(url, options=None):
        """
        Retrieves JSON-LD at the given URL.

        Only absolute "http"/"https" URLs (only "https" when ``secure`` is
        set in the enclosing scope) whose network location is made of ASCII
        letters, digits, '-', '.' and ':' are dereferenced. When the
        response is not JSON and advertises an ``alternate`` link of type
        ``application/ld+json``, that link is loaded instead.

        :param url: the URL to retrieve.
        :param options: optional dict; its ``'headers'`` entry, when
            present, replaces the default request headers.

        :return: the RemoteDocument: a dict with ``contentType``,
            ``contextUrl``, ``documentUrl`` and ``document``.

        :raises JsonLdError: if the URL is rejected, retrieval or JSON
            parsing fails, or more than one context Link header is present.
        """
        # A fresh dict per call instead of a shared mutable default.
        if options is None:
            options = {}
        try:
            # validate URL
            pieces = urllib_parse.urlparse(url)
            allowed_netloc_chars = set(
                string.ascii_letters + string.digits + '-.:')
            # The netloc must be a *subset* of the allowed characters. The
            # former strict-superset test (`>`) could only be true when the
            # netloc contained every allowed character, so it never fired.
            if (not all([pieces.scheme, pieces.netloc])
                    or pieces.scheme not in ['http', 'https']
                    or not set(pieces.netloc) <= allowed_netloc_chars):
                raise JsonLdError(
                    'URL could not be dereferenced; only "http" and "https" '
                    'URLs are supported.',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')
            if secure and pieces.scheme != 'https':
                raise JsonLdError(
                    'URL could not be dereferenced; secure mode enabled and '
                    'the URL\'s scheme is not "https".',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')
            headers = options.get('headers')
            if headers is None:
                headers = {'Accept': 'application/ld+json, application/json'}
            response = requests.get(url, headers=headers, **kwargs)

            content_type = response.headers.get('content-type')
            if not content_type:
                content_type = 'application/octet-stream'
            # Compare the bare media type only, so parameters such as
            # "; charset=utf-8" do not defeat the checks below.
            media_type = content_type.split(';', 1)[0].strip().lower()
            doc = {
                'contentType': content_type,
                'contextUrl': None,
                'documentUrl': response.url,
            }
            link_header = response.headers.get('link')
            if link_header:
                links = parse_link_header(link_header)
                linked_context = links.get(LINK_HEADER_REL)
                # only 1 related link header permitted
                if linked_context and media_type != 'application/ld+json':
                    if isinstance(linked_context, list):
                        raise JsonLdError(
                            "URL could not be dereferenced, "
                            "it has more than one "
                            "associated HTTP Link Header.",
                            "jsonld.LoadDocumentError", {"url": url},
                            code="multiple context link headers")
                    doc["contextUrl"] = linked_context["target"]
                linked_alternate = links.get('alternate')
                # if not JSON-LD, alternate may point there
                if (linked_alternate and linked_alternate.get('type')
                        == 'application/ld+json' and not re.match(
                            r'^application\/(\w*\+)?json$', media_type)):
                    alternate_url = prepend_base(
                        url, linked_alternate['target'])
                    return loader(alternate_url, options=options)
            doc["document"] = response.json()
            return doc
        except JsonLdError:
            # Already the right error type; re-raise with its traceback.
            raise
        except Exception as cause:
            raise JsonLdError(
                'Could not retrieve a JSON-LD document from the URL.',
                'jsonld.LoadDocumentError',
                code='loading document failed',
                cause=cause)
Esempio n. 9
0
    async def async_loader(url):
        """
        Retrieves JSON-LD at the given URL asynchronously.

        Only absolute "http"/"https" URLs (only "https" when ``secure`` is
        set in the enclosing scope) whose network location is made of ASCII
        letters, digits, '-', '.' and ':' are dereferenced.

        :param url: the URL to retrieve.

        :return: the RemoteDocument: a dict with ``contextUrl``,
            ``documentUrl`` (the final response URL) and ``document`` (the
            parsed JSON body).

        :raises JsonLdError: if the URL is rejected, retrieval or JSON
            parsing fails, or more than one context Link header is present.
        """
        try:
            # validate URL
            pieces = urllib_parse.urlparse(url)
            allowed_netloc_chars = set(
                string.ascii_letters + string.digits + '-.:')
            # The netloc must be a *subset* of the allowed characters. The
            # former strict-superset test (`>`) could only be true when the
            # netloc contained every allowed character, so it never fired.
            if (not all([pieces.scheme, pieces.netloc])
                    or pieces.scheme not in ['http', 'https']
                    or not set(pieces.netloc) <= allowed_netloc_chars):
                raise JsonLdError(
                    'URL could not be dereferenced; '
                    'only "http" and "https" URLs are supported.',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')
            if secure and pieces.scheme != 'https':
                raise JsonLdError(
                    'URL could not be dereferenced; '
                    'secure mode enabled and '
                    'the URL\'s scheme is not "https".',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')
            headers = {'Accept': 'application/ld+json, application/json'}
            async with aiohttp.ClientSession(loop=loop) as session:
                async with session.get(url, headers=headers,
                                       **kwargs) as response:
                    # Allow any content_type in trying to parse json
                    # similar to requests library
                    json_body = await response.json(content_type=None)
                    doc = {
                        'contextUrl': None,
                        'documentUrl': response.url.human_repr(),
                        'document': json_body
                    }
                    content_type = response.headers.get('content-type')
                    # Compare the bare media type only, so a header such as
                    # "application/ld+json; charset=utf-8" still counts as
                    # JSON-LD.
                    media_type = (
                        (content_type or '').split(';', 1)[0].strip().lower())
                    link_header = response.headers.get('link')
                    if link_header and media_type != 'application/ld+json':
                        linked_context = parse_link_header(link_header).get(
                            LINK_HEADER_REL)
                        # only 1 related link header permitted
                        if isinstance(linked_context, list):
                            raise JsonLdError(
                                'URL could not be dereferenced, '
                                'it has more than one '
                                'associated HTTP Link Header.',
                                'jsonld.LoadDocumentError', {'url': url},
                                code='multiple context link headers')
                        if linked_context:
                            doc['contextUrl'] = linked_context['target']
                    return doc
        except JsonLdError:
            # Already the right error type; re-raise with its traceback.
            raise
        except Exception as cause:
            raise JsonLdError(
                'Could not retrieve a JSON-LD document from the URL.',
                'jsonld.LoadDocumentError',
                code='loading document failed',
                cause=cause)