def __call__(self, url):
    """
    Retrieve JSON-LD at the given URL through this loader's cached session.

    :param url: the URL to retrieve.

    :return: the RemoteDocument dict (contextUrl/documentUrl/document,
        plus 'from_cache' when caching is enabled).

    :raises JsonLdError: if the URL is invalid or retrieval fails.
    """
    try:
        # validate URLs
        pieces = urlparse(url)
        if (not all([pieces.scheme, pieces.netloc]) or
                pieces.scheme not in ['http', 'https'] or
                set(pieces.netloc) > set(
                    string.ascii_letters + string.digits + '-.:')):
            # BUG FIX: the message was previously split across two
            # positional arguments, which pushed 'jsonld.InvalidUrl' into
            # the `details` slot, made {'url': url} the positional `code`,
            # and then passed `code=` a second time as a keyword —
            # raising a TypeError whenever this branch fired.
            raise JsonLdError(
                'Could not dereference URL; can only load URLs using '
                'the "http" and "https" schemes.',
                'jsonld.InvalidUrl', {'url': url},
                code='loading document failed')
        response = self.session.get(
            url, headers={'Accept': 'application/ld+json, application/json'})
        doc = {
            'contextUrl': None,
            'documentUrl': url,
            'document': response.text
        }
        if self.use_cache:
            # requests_cache responses expose `from_cache`; also purge
            # expired entries while we are here.
            doc['from_cache'] = response.from_cache
            self.session.remove_expired_responses()
        # Save URL for Potential Extension contexts.
        try:
            data = json.loads(response.text)
            context = data['@context']
            # Only record URLs whose @context embeds at least one
            # dict-valued term definition.
            if any([isinstance(el, dict) for el in list_of(context)]):
                self.contexts.update([url])
        except Exception:
            # Best-effort bookkeeping only; never fail the load for it.
            pass
        return doc
    except JsonLdError as e:
        raise e
    except Exception as cause:
        raise JsonLdError(
            'Could not retrieve JSON-LD document from URL.',
            'jsonld.LoadDocumentError',
            code='loading document failed', cause=cause)
def jsonld_document_loader(url):
    """
    Retrieves JSON-LD at the given URL.

    :param url: the URL to retrieve.

    :return: the RemoteDocument.

    :raises JsonLdError: if the URL is invalid or retrieval fails.
    """
    allowed_host_chars = set(string.ascii_letters + string.digits + '-.:')
    try:
        # validate URL before fetching
        parts = urlparse(url)
        complete = all([parts.scheme, parts.netloc])
        scheme_ok = parts.scheme in ['http', 'https']
        host_ok = not (set(parts.netloc) > allowed_host_chars)
        if not (complete and scheme_ok and host_ok):
            raise JsonLdError(
                'URL could not be dereferenced; only "http" and "https" '
                'URLs are supported.',
                'jsonld.InvalidUrl', {'url': url},
                code='loading document failed')
        accept = {'Accept': 'application/ld+json, application/json'}
        response = requests.get(url, headers=accept)
        return {
            'contextUrl': None,
            'documentUrl': url,
            'document': response.text
        }
    except JsonLdError as e:
        raise e
    except Exception as cause:
        raise JsonLdError(
            'Could not retrieve a JSON-LD document from the URL.',
            'jsonld.LoadDocumentError',
            code='loading document failed', cause=cause)
def loader(url, prev_url, options=None):
    """
    Retrieves JSON-LD from a URL, a file location or as text

    :param url: the URL to retrieve.
    :param prev_url: Dictionary to carry the previous URL referenced
    :param options: Additional options

    :return: the RemoteDocument.
    """
    if options is None:
        options = {}

    # Process text being passed in as the document
    # NOTE(review): url.strip()[0] will IndexError on an empty/whitespace
    # string — confirm callers never pass one.
    if url.strip()[0] in '[{' or '\n' in url:
        return {
            'contentType': 'text/plain',
            'contextUrl': None,
            'documentUrl': None,
            'document': json.loads(url)
        }

    # process relative URL
    pieces = urlparse(url)
    if not any([pieces.scheme, pieces.netloc]):
        # Relative reference: resolve against the previously seen URL.
        if prev_url['url']:
            url = urljoin(prev_url['url'], url)
            pieces = urlparse(url)
    else:
        # Absolute URL: remember it as the base for later relative refs.
        prev_url['url'] = url

    # check for file access
    if pieces.scheme == 'file':
        try:
            with open(pieces.path) as f:
                doc = f.read()
                return {
                    'contentType': 'text/plain',
                    'contextUrl': None,
                    'documentUrl': url,
                    'document': json.loads(doc)
                }
        except Exception as cause:
            raise JsonLdError(
                f'Could not retrieve a JSON-LD document from {url}.',
                'jsonld.LoadDocumentError',
                code='loading document failed', cause=cause)
    else:
        # Fall back to an HTTP loader.
        # NOTE(review): `kwargs` is not defined in this function — it is
        # presumably captured from an enclosing scope; verify.
        return requests_document_loader(**kwargs)(url, options)
async def async_loader(client, url):
    """
    Retrieves a JSON-LD document from IPFS, resolving ipschema:// URLs
    through this node's IPNS keys.

    :param client: asynchronous IPFS client
    :param url: the URL to retrieve.

    :return: the RemoteDocument.

    :raises JsonLdError: if the document cannot be retrieved.
    """
    try:
        if url.startswith('ipschema://'):
            o = urlparse(url)

            # Map the hostname to one of this node's IPNS keys.
            kList = await client.key.list()
            ipnsKey = None
            for key in kList['Keys']:
                if key['Name'] == o.netloc:
                    ipnsKey = key['Id']

            if ipnsKey is None:
                path = None
            else:
                path = IPFSPath(joinIpns(ipnsKey)).child(o.path)
        else:
            path = IPFSPath(url)

        # BUG FIX: previously `path.valid` was dereferenced even when
        # `path` was None (unknown IPNS key), raising AttributeError
        # which was then masked as a generic load failure.
        if path is None or not path.valid:
            raise Exception('Not a valid path')

        data = await client.cat(path.objPath)
        return {
            'document': json.loads(data.decode()),
            'documentUrl': url,
            'contextUrl': None
        }
    except aioipfs.APIError as e:
        log.debug(str(e))
        raise e
    except JsonLdError as e:
        log.debug(str(e))
        raise e
    except Exception as cause:
        raise JsonLdError(
            'Could not retrieve a JSON-LD document from the URL.',
            'jsonld.LoadDocumentError',
            code='loading document failed', cause=cause)
async def async_loader(client, url, options=None):
    """
    Retrieves a JSON-LD context document from IPFS, resolving ips:// and
    ipschema:// URLs through the schema name service.

    :param client: asynchronous IPFS client
    :param url: the URL to retrieve.
    :param options: extra loader options (accepted for API compatibility;
        currently unused)

    :return: the RemoteDocument.

    :raises JsonLdError: if the document cannot be retrieved.
    """
    # BUG FIX (idiom): replaced the mutable `options={}` default with None.
    if options is None:
        options = {}
    try:
        o = urlparse(url)

        if o.scheme in ['ipschema', 'ips']:
            ipsKey = o.netloc

            if ipsKey == 'galacteek.ld.contexts':
                # Compat: the legacy namespace only serves an empty root
                # context; any other path under it is invalid.
                ipsKey = 'galacteek.ld'

                if not o.path or o.path == '/':
                    return {
                        'contentType': 'application/ld+json',
                        'document': {},
                        'documentUrl': url,
                        'contextUrl': None
                    }

                raise JsonLdError(
                    f'Invalid context path for URL: {url}',
                    'jsonld.InvalidUrl', {'url': url},
                    code='loading document failed')

            sIpfsPath = await ldSchemas.nsToIpfs(ipsKey)
            path = None if sIpfsPath is None else sIpfsPath.child(o.path)
        else:
            path = IPFSPath(url)

        # BUG FIX: guard `path is None` (unresolved namespace) before
        # dereferencing `.valid`; previously that raised AttributeError
        # and surfaced only as a generic load failure.
        if path is None or not path.valid:
            raise Exception(f'Not a valid path: {url}')

        # Cap the IPFS fetch at 10 seconds.
        data = await asyncio.wait_for(client.cat(path.objPath), 10)
        obj = orjson.loads(data.decode())
        assert obj is not None

        return {
            'contentType': 'application/ld+json',
            'document': obj,
            'documentUrl': url,
            'contextUrl': None
        }
    except asyncio.TimeoutError as terr:
        log.debug(f'Timeout error while loading context: {terr}')
        raise terr
    except aioipfs.APIError as e:
        log.debug(f'IPFS error while loading context: {e}')
        raise e
    except JsonLdError as e:
        log.debug(str(e))
        raise e
    except Exception as cause:
        raise JsonLdError(
            'Could not retrieve a JSON-LD document from the URL.',
            'jsonld.LoadDocumentError',
            code='loading document failed', cause=cause)
def loader(url):
    """
    Retrieves JSON-LD at the given URL.

    :param url: the URL to retrieve.

    :return: the RemoteDocument.

    :raises JsonLdError: if the URL is invalid, insecure in secure mode,
        or retrieval fails.
    """
    try:
        # Up-front URL validation.
        pieces = urllib_parse.urlparse(url)
        valid_netloc_chars = set(
            string.ascii_letters + string.digits + "-.:")
        if (not all([pieces.scheme, pieces.netloc])
                or pieces.scheme not in ["http", "https"]
                or set(pieces.netloc) > valid_netloc_chars):
            raise JsonLdError(
                'URL could not be dereferenced; only "http" and "https" '
                "URLs are supported.",
                "jsonld.InvalidUrl",
                {"url": url},
                code="loading document failed",
            )
        if secure and pieces.scheme != "https":
            raise JsonLdError(
                "URL could not be dereferenced; secure mode enabled and "
                'the URL\'s scheme is not "https".',
                "jsonld.InvalidUrl",
                {"url": url},
                code="loading document failed",
            )

        response = session.get(
            url,
            headers={"Accept": "application/ld+json, application/json"},
            **kwargs,
        )
        doc = {
            "contextUrl": None,
            "documentUrl": response.url,
            "document": response.json(),
        }

        # A Link header may name a remote context, unless the body is
        # already JSON-LD.
        content_type = response.headers.get("content-type")
        link_header = response.headers.get("link")
        if link_header and content_type != "application/ld+json":
            related = parse_link_header(link_header).get(LINK_HEADER_REL)
            # only 1 related link header permitted
            if isinstance(related, list):
                raise JsonLdError(
                    "URL could not be dereferenced, it has more than one "
                    "associated HTTP Link Header.",
                    "jsonld.LoadDocumentError",
                    {"url": url},
                    code="multiple context link headers",
                )
            if related:
                doc["contextUrl"] = related["target"]
        return doc
    except JsonLdError as e:
        raise e
    except Exception as cause:
        raise JsonLdError(
            "Could not retrieve a JSON-LD document from the URL.",
            "jsonld.LoadDocumentError",
            code="loading document failed",
            cause=cause,
        )
def test_page(ark):
    """
    Flask view: fetch a JSON-LD record for `ark`, flatten it, and build
    cytoscape-style `elements` (nodes + de-duplicated, label-merged edges)
    for the graph-visualization template.

    :param ark: identifier appended to EG_URL to fetch the record.

    :return: rendered template, or the backend's error string.
    """
    global elements
    elements = []

    # Prefer the auth cookie; fall back to the Authorization header.
    if request.cookies.get("fairscapeAuth") is None:
        token = request.headers.get("Authorization")
    else:
        token = request.cookies.get("fairscapeAuth")

    data_jsonld = requests.get(EG_URL + ark, headers={
        "Authorization": token
    }).json()
    if 'error' in data_jsonld.keys():
        return data_jsonld['error']

    # BUG FIX: the `except` clause of this `try` had been commented out,
    # leaving a bare `try:` (a syntax error). Restored per the
    # commented-out intent.
    try:
        data_jsonld_flat = jsonld.flatten(data_jsonld)
    except Exception as cause:
        raise JsonLdError('Error flattening JSON-LD content ', cause)

    elements = []       # contains nodes and edges
    nodes = []          # vertices
    edges = []          # links between vertices
    id_position = {}    # mapping of each @id to a number
    counter = 0

    # TODO: to check if http://schema.org/name missing
    for level in data_jsonld_flat:
        # flattening generates a blank node _:b when @id is missing
        # NOTE(review): sys.exit() inside a web view kills the worker;
        # consider returning an error response instead — kept for
        # behavior compatibility.
        if level.get('@id') is None or '_:b' in level['@id']:
            print('Error: found blank node for missing @id at: ', level)
            sys.exit()
        if level.get('@type') is None:
            print('Error: missing @type at: ', level)
            sys.exit()

        nodes_element = {}
        nodes_element['id'] = counter
        nodes_element['@id'] = level['@id']
        if os.environ.get("LOCAL", False):
            # href in cytoscape to open as a URI
            # NOTE(review): host:port was redacted in the source
            # ('*****:*****'); restore the real local base URL here.
            nodes_element['href'] = 'http://*****:*****' + level['@id']
        else:
            # href in cytoscape to open as a URI
            nodes_element['href'] = MDS_URL + level['@id']
        nodes_element['@type'] = level['@type'][0]
        # @type cannot be retrieved as node(@type)
        nodes_element['type'] = level['@type'][0]
        nodes_element['name'] = level['https://schema.org/name'][0]['@value']
        # all attributes together
        nodes_element['info'] = (
            'Name: ' + level['https://schema.org/name'][0]['@value'] +
            '\nType: ' + level['@type'][0] +
            '\nPID: ' + level['@id'])
        nodes.append({'data': nodes_element})
        id_position[level['@id']] = counter
        counter += 1

    # populate edges: every dict value carrying an @id links source→target
    for item in data_jsonld_flat:
        source_id = item['@id']  # chooses @id as source at each level
        for key, value in item.items():
            if isinstance(value, list):
                for i in value:
                    if isinstance(i, dict) and '@id' in i.keys():
                        edges.append({'data': {
                            'source': id_position[source_id],
                            'target': id_position[i['@id']],
                            'label': key,
                        }})

    # copies all nodes and edges inside elements
    elements = nodes.copy()
    for element in edges:
        elements.append(element)

    # Convert multiple edges such that e1(v1, v2), e2(v1, v2) become one
    # edge whose label lists all properties, e.g. [founder, member]
    source = []
    target = []
    label = []
    for edge_data in edges:
        for edge in edge_data.values():
            source.append(edge['source'])
            target.append(edge['target'])
            label.append(edge['label'])
    df = pd.DataFrame(data={'source': source, 'target': target,
                            'label': label})

    # Split edges into those sharing endpoints (to be merged) and the rest.
    df_edge_has_common_nodes = df[df.duplicated(subset=['source', 'target'],
                                                keep=False)]
    df_unique = df.drop_duplicates(subset=['source', 'target'], keep=False)
    df_merged_edge_has_common_nodes = (
        df_edge_has_common_nodes
        .groupby(['source', 'target'], as_index=False)
        .agg({'label': ','.join}))

    # Maps uri prefix to its suffix e.g. {http://schema.org/ : member}
    uri_prefix_suffix_dict_list = []

    def get_property_labels(labels):
        # Turn 'http://schema.org/founder,http://schema.org/member' into
        # 'founder' or '[founder, member]', recording prefix→suffix pairs.
        property_list = str(labels).split(',')
        if len(property_list) == 1:
            suffix = property_list[0].split('/')[-1]
            prefix = property_list[0].replace(suffix, '')
            uri_prefix_suffix_dict_list.append({prefix: suffix})
            return suffix
        elif len(property_list) > 1:
            remaining = len(property_list)
            props_list = '['  # this string adds the anticipated []
            for prop in property_list:
                suffix = prop.split('/')[-1]
                prefix = prop.replace(suffix, '')
                uri_prefix_suffix_dict_list.append({prefix: suffix})
                remaining -= 1
                if remaining > 0:
                    props_list += suffix + ', '
                else:
                    props_list += suffix
            return props_list + ']'

    elements = []  # reinitialize empty nodes and edges

    # Populate only unique edges which are not shared between two nodes
    for index, row in df_unique.iterrows():
        property_label = get_property_labels(row['label'])
        if property_label is None:
            print('ERROR: Could not find property label!')
            sys.exit()
        elements.append({'data': {
            'source': row['source'],
            'target': row['target'],
            'label': property_label,
        }})

    # Populate only those edges which are shared between two vertices
    for index, row in df_merged_edge_has_common_nodes.iterrows():
        property_labels = get_property_labels(row['label'])
        if property_labels is None:
            print('ERROR: Could not find property labels!')
            sys.exit()
        elements.append({'data': {
            'source': row['source'],
            'target': row['target'],
            'label': property_labels,
        }})

    # Adding the nodes that participate in at least one edge.
    def is_node_in_edges(node, edges):
        edge_nodes = set()
        for edge_data_value in edges:
            edge_nodes.add(edge_data_value['data']['source'])
            edge_nodes.add(edge_data_value['data']['target'])
        return node['data']['id'] in edge_nodes

    for node in nodes:
        if is_node_in_edges(node, edges):
            elements.append(node)

    print('Made it to render.')
    if os.environ.get("LOCAL", False):
        return render_template('local_index.html')
    return render_template('index.html')
def loader(url, options=None):
    """
    Retrieves JSON-LD at the given URL, following an `alternate` Link
    header to the actual JSON-LD representation when necessary.

    :param url: the URL to retrieve.
    :param options: optional dict; 'headers' may supply a custom headers
        dict for the HTTP request.

    :return: the RemoteDocument.

    :raises JsonLdError: if the URL is invalid, insecure in secure mode,
        carries multiple context link headers, or retrieval fails.
    """
    # BUG FIX (idiom): replaced the mutable `options={}` default with
    # None; behavior for callers is unchanged.
    if options is None:
        options = {}
    try:
        # validate URL
        pieces = urllib_parse.urlparse(url)
        if (not all([pieces.scheme, pieces.netloc]) or
                pieces.scheme not in ['http', 'https'] or
                set(pieces.netloc) > set(string.ascii_letters +
                                         string.digits + '-.:')):
            raise JsonLdError(
                'URL could not be dereferenced; only "http" and "https" '
                'URLs are supported.',
                'jsonld.InvalidUrl', {'url': url},
                code='loading document failed')
        if secure and pieces.scheme != 'https':
            raise JsonLdError(
                'URL could not be dereferenced; secure mode enabled and '
                'the URL\'s scheme is not "https".',
                'jsonld.InvalidUrl', {'url': url},
                code='loading document failed')

        headers = options.get('headers')
        if headers is None:
            headers = {'Accept': 'application/ld+json, application/json'}
        response = requests.get(url, headers=headers, **kwargs)

        content_type = response.headers.get('content-type')
        if not content_type:
            content_type = 'application/octet-stream'
        doc = {
            'contentType': content_type,
            'contextUrl': None,
            'documentUrl': response.url,
        }

        link_header = response.headers.get('link')
        if link_header:
            linked_context = parse_link_header(link_header).get(
                LINK_HEADER_REL)
            # only 1 related link header permitted
            if linked_context and content_type != 'application/ld+json':
                if isinstance(linked_context, list):
                    raise JsonLdError(
                        'URL could not be dereferenced, '
                        'it has more than one '
                        'associated HTTP Link Header.',
                        'jsonld.LoadDocumentError',
                        {'url': url},
                        code='multiple context link headers')
                doc['contextUrl'] = linked_context['target']

            # if not JSON-LD, alternate may point there
            linked_alternate = parse_link_header(link_header).get(
                'alternate')
            if (linked_alternate and
                    linked_alternate.get('type') == 'application/ld+json'
                    and not re.match(r'^application\/(\w*\+)?json$',
                                     content_type)):
                doc['contentType'] = 'application/ld+json'
                doc['documentUrl'] = prepend_base(
                    url, linked_alternate['target'])
                # Follow the alternate link to the real JSON-LD document.
                return loader(doc['documentUrl'], options=options)

        doc['document'] = response.json()
        return doc
    except JsonLdError as e:
        raise e
    except Exception as cause:
        raise JsonLdError(
            'Could not retrieve a JSON-LD document from the URL.',
            'jsonld.LoadDocumentError',
            code='loading document failed', cause=cause)
async def async_loader(url):
    """
    Retrieves JSON-LD at the given URL asynchronously.

    :param url: the URL to retrieve.

    :return: the RemoteDocument.

    :raises JsonLdError: if the URL is invalid, insecure in secure mode,
        carries multiple context link headers, or retrieval fails.
    """
    try:
        # Up-front validation of the URL.
        parts = urllib_parse.urlparse(url)
        allowed = set(string.ascii_letters + string.digits + '-.:')
        if (not all([parts.scheme, parts.netloc])
                or parts.scheme not in ['http', 'https']
                or set(parts.netloc) > allowed):
            raise JsonLdError(
                'URL could not be dereferenced; '
                'only "http" and "https" URLs are supported.',
                'jsonld.InvalidUrl', {'url': url},
                code='loading document failed')
        if secure and parts.scheme != 'https':
            raise JsonLdError(
                'URL could not be dereferenced; '
                'secure mode enabled and '
                'the URL\'s scheme is not "https".',
                'jsonld.InvalidUrl', {'url': url},
                code='loading document failed')

        accept = {'Accept': 'application/ld+json, application/json'}
        async with aiohttp.ClientSession(loop=loop) as session:
            async with session.get(url, headers=accept,
                                   **kwargs) as response:
                # Allow any content_type in trying to parse json,
                # similar to the requests library.
                body = await response.json(content_type=None)
                doc = {
                    'contextUrl': None,
                    'documentUrl': response.url.human_repr(),
                    'document': body
                }
                ctype = response.headers.get('content-type')
                link = response.headers.get('link')
                if link and ctype != 'application/ld+json':
                    related = parse_link_header(link).get(LINK_HEADER_REL)
                    # only 1 related link header permitted
                    if isinstance(related, list):
                        raise JsonLdError(
                            'URL could not be dereferenced, '
                            'it has more than one '
                            'associated HTTP Link Header.',
                            'jsonld.LoadDocumentError',
                            {'url': url},
                            code='multiple context link headers')
                    if related:
                        doc['contextUrl'] = related['target']
                return doc
    except JsonLdError as e:
        raise e
    except Exception as cause:
        raise JsonLdError(
            'Could not retrieve a JSON-LD document from the URL.',
            'jsonld.LoadDocumentError',
            code='loading document failed', cause=cause)