def contact_source_direct(server, query, queue, vars=None, **kwargs):
    """Query one source in the calling process, then put an ``EOF()`` on ``queue``.

    The interface is picked from a marker contained in ``server``; the marker
    is stripped before the request is made:

    * ``sparql@`` -> ``contact_single_endpoint_server``
    * ``brtpf@``  -> ``contact_single_brtpf_server``
    * ``tpf@`` or no marker -> ``contact_single_tpf_server``

    Args:
        server: Source identifier, optionally carrying an interface marker.
        query: Query object handed through to the interface-specific function.
        queue: Output queue; receives the results and a final ``EOF()``.
        vars: Passed through unchanged to the interface-specific function.
        **kwargs: ``sparql_limit`` (default 500) is forwarded as ``limit`` to
            the SPARQL endpoint function.
    """
    page_limit = kwargs.get("sparql_limit", 500)

    if "sparql@" in server:
        endpoint = server.replace("sparql@", "")
        contact_single_endpoint_server(endpoint, query, queue, vars,
                                       put_eof=False, limit=page_limit)
    elif "brtpf@" in server:
        # Must be tested before "tpf@", which is a substring of "brtpf@".
        endpoint = server.replace("brtpf@", "")
        contact_single_brtpf_server(endpoint, query, queue, vars,
                                    put_eof=False)
    else:
        # Covers both the explicit "tpf@" marker and unmarked servers:
        # stripping the marker leaves an unmarked URL untouched.
        endpoint = server.replace("tpf@", "")
        contact_single_tpf_server(endpoint, query, queue, vars,
                                  put_eof=False)

    # The helpers were told not to emit EOF; terminate the stream here.
    queue.put(EOF())
def contact_source_bindings(servers, query, queue, bindings, vars=None, **kwargs):
    """Send ``query`` with ``bindings`` to each relevant server, one after another.

    Only servers that are keys of ``query.sources`` are contacted. The
    interface is chosen from the marker contained in the server string
    (``sparql@``, ``brtpf@``, ``tpf@`` or none, which is treated as TPF).
    After all servers were processed, a single ``EOF`` carrying the summed
    request count is put on ``queue``.

    Args:
        servers: Iterable of source identifiers.
        query: Query object; must expose ``sources``.
        queue: Output queue for results and the final ``EOF``.
        bindings: Bindings forwarded to the interface-specific function.
        vars: Passed through unchanged.
        **kwargs: ``sparql_limit`` (default 500) is forwarded as ``limit`` to
            the SPARQL endpoint function.
    """
    page_limit = kwargs.get("sparql_limit", 500)
    per_server_requests = []

    for server in servers:
        if server not in query.sources.keys():
            continue

        if "sparql@" in server:
            made = contact_single_endpoint_server_bindings(
                server.replace("sparql@", ""), query, queue, bindings, vars,
                limit=page_limit)
        elif "brtpf@" in server:
            # Checked before the TPF fallback: "tpf@" is a substring of "brtpf@".
            made = contact_single_brtpf_server_binding(
                server.replace("brtpf@", ""), query, queue, bindings, vars)
        else:
            # Explicit "tpf@" marker or no marker at all; the replace is a
            # no-op for unmarked servers.
            made = contact_single_tpf_server_binding(
                server.replace("tpf@", ""), query, queue, bindings, vars)

        per_server_requests.append(made)

    queue.put(EOF(requests=sum(per_server_requests)))
def contact_single_tpf_server(server, query, queue, vars=None, binding=None, **kwargs):
    """Evaluate one triple pattern against a TPF server, page by page.

    Constant terms of the pattern become the ``subject``/``predicate``/
    ``object`` request parameters; variable positions are recorded in a
    3-bit ``template`` (subject=4, predicate=2, object=1) that is handed to
    ``parse_response`` together with each page.

    Args:
        server: URL of the TPF server.
        query: Triple pattern exposing ``subject``, ``predicate``, ``object``.
        queue: Output queue passed to ``parse_response``.
        vars: Variables to use when the pattern itself contains none.
        binding: Passed to ``parse_response``; a fresh dict is used when
            omitted.
        **kwargs: ``put_eof`` (default ``False``) - when true, an
            ``EOF(requests=...)`` is put on ``queue`` before returning.

    Returns:
        Number of HTTP requests issued (0 for an invalid pattern).
    """
    logger = logging.getLogger("nlde_logger")

    # A fresh dict per call; the previous ``binding={}`` default was a single
    # object shared by every call that omitted the argument.
    if binding is None:
        binding = {}

    subject = query.subject
    predicate = query.predicate
    value = query.object

    # Literal in subject or predicate position: NOT a valid triple pattern.
    # Nothing is requested. EOF is only emitted when the caller asked for it,
    # and as an EOF object (not the bare string "EOF") so that consumers can
    # read its request count.
    if subject.isliteral() or predicate.isliteral():
        if kwargs.get("put_eof", False):
            queue.put(EOF(requests=0))
        return 0

    template = 0
    qvars = []
    param_dict = {}

    # Extract subject.
    if subject.isvariable():
        template |= 4
        qvars.append(subject.get_variable())
    else:
        param_dict['subject'] = subject.value

    # Extract predicate.
    if predicate.isvariable():
        template |= 2
        qvars.append(predicate.get_variable())
    else:
        param_dict['predicate'] = predicate.value

    # Extract object (value).
    if value.isvariable():
        template |= 1
        qvars.append(value.get_variable())
    elif value.isfloat():
        param_dict['object'] = '"{}"^^{}'.format(
            value, "<http://www.w3.org/2001/XMLSchema#double>")
    elif value.isint():
        param_dict['object'] = '"{}"^^{}'.format(
            value, "<http://www.w3.org/2001/XMLSchema#integer>")
    else:
        raw = value.value
        # Quote plain literals that are not quoted yet. startswith/endswith
        # (instead of indexing) keeps an empty literal from raising.
        needs_quotes = (not raw.startswith("http") and not value.isuri()
                        and not raw.startswith('"')
                        and not raw.endswith('"'))
        param_dict['object'] = '"{}"'.format(raw) if needs_quotes else raw

    # When there are no variables in the TP, use the ones provided.
    if not qvars and vars:
        qvars = vars

    # Pagination settings.
    page = 1
    total = 0
    count_requests = 0
    card_sum = 0
    elapsed = 0
    next_page = True

    while next_page:
        count_requests += 1

        # Establish connection and get response from server.
        param_dict['page'] = page
        response = requests.get(server, params=param_dict, headers=headers)
        elapsed += response.elapsed.total_seconds()
        next_page = False

        if response.status_code != 200:
            print("Could not contact TPF server {}. Status code: {}".format(
                server, response.status_code))
            print(response.headers)
            break

        res = response.content

        if page == 1:
            # Get total solutions in fragment; give up when the count cannot
            # be determined unambiguously.
            matches = findall(triples_regex, res)
            if len(matches) != 1:
                break
            total = int(matches[0])

        # Get solution mappings from fragment.
        if total > 0:
            # SECURITY(review): eval() executes whatever the server returned.
            # Consider a real parser (e.g. json.loads) once it is confirmed
            # that the payload is always valid JSON.
            myres = eval(res)
            card_sum += parse_response(template, myres["@graph"], qvars,
                                       queue, server, 0, myres["@context"],
                                       binding)

        # Prepare next request.
        next_page = "nextPage" in res or "hydra:next" in res
        page += 1

    mgs = {
        "requests": count_requests,
        "tuples": card_sum,
        "querypath": response.url,
        "timestamp": str(datetime.datetime.now()),
        "elapsed": elapsed,
        "interface": "tpf"
    }
    logger.info(str(mgs))

    if kwargs.get("put_eof", False):
        queue.put(EOF(requests=count_requests))
    return count_requests
def contact_single_endpoint_server(server, query, queue, vars=None, **kwargs):
    """Run a SELECT query against a SPARQL endpoint, paging with LIMIT/OFFSET.

    The query is built from ``select_tmplt`` using the string form of every
    triple pattern and the union of their variables. Each page of bindings is
    passed to ``parse_response`` together with ``queue``.

    Args:
        server: URL of the SPARQL endpoint.
        query: A triple pattern or a list of them; each must expose
            ``variables`` and a string form.
        queue: Output queue handed to ``parse_response``.
        vars: Unused here; kept for a signature parallel to the other
            ``contact_single_*`` functions.
        **kwargs:
            limit: Page size (default ``PAGE_SIZE``). A non-positive value
                sends the query once, without LIMIT/OFFSET.
            put_eof: When true, put ``EOF(requests=...)`` on ``queue`` at
                the end.

    Returns:
        Number of HTTP requests issued.
    """
    logger = logging.getLogger("nlde_logger")
    logger_debug = logging.getLogger("nlde_debug")

    patterns = query if isinstance(query, list) else [query]
    query_str = "\n".join([str(tp) for tp in patterns])

    projection_vars = set()
    for tp in patterns:
        projection_vars.update(tp.variables)

    base_query = select_tmplt.format(
        " ".join(["?{}".format(var) for var in projection_vars]), query_str)

    # Pagination settings.
    limit = kwargs.get("limit", PAGE_SIZE)
    # Any non-positive limit means "no paging". Previously only limit == 0
    # stopped the loop, so a negative limit re-sent the same unpaginated
    # query forever.
    paginate = limit > 0
    page_index = 0
    count_requests = 0
    card_sum = 0
    elapsed = 0

    while True:
        count_requests += 1

        if paginate:
            select_query = "{} LIMIT {} OFFSET {}".format(
                base_query, limit, page_index * limit)
        else:
            select_query = base_query

        params = {"query": select_query, "format": "json", "timeout": "30000"}
        response = http.get(server, params=params, headers=headers)
        elapsed += response.elapsed.total_seconds()

        if response.status_code != 200:
            print("Could not contact SPARQL endpoint {}. Status code: {}".format(
                server, response.status_code))
            print(response.url)
            break

        # Successfully contacted the server.
        rows = response.json()['results']['bindings']
        if len(rows) == 0:
            break

        results = parse_response(queue, rows)
        card_sum += results

        # A short page (or an unpaginated query) is the last one.
        if not paginate or results < limit:
            break
        page_index += 1

    mgs = {
        "requests": count_requests,
        "tuples": card_sum,
        "timestamp": str(datetime.datetime.now()),
        "elapsed": elapsed,
        "interface": "sparql"
    }
    logger.info(str(mgs))
    logger_debug.info(str({"querypath": response.url}))

    if kwargs.get("put_eof", False):
        queue.put(EOF(requests=count_requests))
    return count_requests
def contact_source(servers, query, queue, vars=None, **kwargs):
    """Query all relevant servers in parallel and merge their output.

    One ``Process`` is started per server that is a key of ``query.sources``.
    The interface is chosen from the marker contained in the server string
    (``sparql@``, ``brtpf@``, ``tpf@`` or none, which is treated as TPF).
    Workers write to a private queue; everything that is not an EOF marker is
    forwarded to ``queue``. Once every worker has sent its EOF, a single
    ``EOF`` with the summed request count is put on ``queue``.

    Args:
        servers: Iterable of source identifiers.
        query: Query object; must expose ``sources``.
        queue: Output queue for merged results and the final ``EOF``.
        vars: Passed through to the workers unchanged.
        **kwargs: ``sparql_limit`` (default 500) is forwarded as ``limit`` to
            SPARQL endpoint workers.
    """
    sparql_limit = kwargs.get("sparql_limit", 500)
    aux_queue = Queue()
    workers = []

    for server in servers:
        if server not in query.sources.keys():
            continue

        worker_kwargs = {"put_eof": True}
        if "sparql@" in server:
            target = contact_single_endpoint_server
            server_url = server.replace("sparql@", "")
            worker_kwargs["limit"] = sparql_limit
        elif "brtpf@" in server:
            # Checked before the TPF fallback: "tpf@" is a substring of "brtpf@".
            target = contact_single_brtpf_server
            server_url = server.replace("brtpf@", "")
        else:
            # Explicit "tpf@" marker or no marker; the replace is a no-op for
            # unmarked servers.
            target = contact_single_tpf_server
            server_url = server.replace("tpf@", "")

        worker = Process(target=target,
                         args=(server_url, query, aux_queue, vars),
                         kwargs=worker_kwargs)
        worker.start()
        workers.append(worker)

    if not workers:
        # No relevant source: signal end-of-stream with the same EOF type the
        # normal path uses, rather than a bare string.
        queue.put(EOF(requests=0))
        return None

    total_requests = 0
    finished = 0
    while finished < len(workers):
        item = aux_queue.get()
        if item != "EOF":
            queue.put(item)
            continue
        # A worker may signal its end with a plain "EOF" string, which has no
        # request count; only read the count when the marker offers one.
        if hasattr(item, "get"):
            total_requests += item.get("requests", 0)
        finished += 1

    # Every worker has delivered its final item, so joining cannot block on
    # unflushed queue data; it reaps the finished child processes.
    for worker in workers:
        worker.join()

    queue.put(EOF(requests=total_requests))
def contact_single_endpoint_server_bindings(server, query, queue, bindings, vars=None, **kwargs):
    """Run a SELECT query restricted by a VALUES block against a SPARQL endpoint.

    The variables of the first binding form the VALUES header. Every binding
    contributes one row (duplicates are dropped); values starting with
    ``http://`` or ``https://`` are written as IRIs, everything else as an
    escaped, quoted literal. Results are paged with LIMIT/OFFSET and passed
    to ``parse_response`` together with ``queue``.

    This function never puts an EOF on ``queue``; it only reports how many
    requests it made.

    Args:
        server: URL of the SPARQL endpoint.
        query: A triple pattern or a list of them; each must expose
            ``variables`` and a string form.
        queue: Output queue handed to ``parse_response``.
        bindings: Sequence of dicts mapping variable name to string value.
        vars: Unused; the VALUES header is derived from ``bindings``.
        **kwargs: ``limit`` - page size (default ``PAGE_SIZE``); a
            non-positive value sends the query once without LIMIT/OFFSET.

    Returns:
        Number of HTTP requests issued (0 when ``bindings`` is empty).
    """
    logger = logging.getLogger("nlde_logger")
    logger_debug = logging.getLogger("nlde_debug")

    # Nothing to bind: no request is made. The old guard put an EOF and then
    # fell through to ``bindings[0]``, raising IndexError.
    if not bindings:
        return 0

    patterns = query if isinstance(query, list) else [query]
    query_str = "\n".join([str(tp) for tp in patterns])

    var_names = list(bindings[0].keys())
    values_header = " ".join(["?{}".format(var) for var in var_names])

    rows = []
    seen_rows = set()
    for binding in bindings:
        inner_str = ""
        # Emit cells in header order so each value lines up with its
        # variable, independent of the individual dict's iteration order.
        for var in var_names:
            value = binding.get(var)
            if value is None:
                # SPARQL 1.1 keyword for "no value" in a VALUES row.
                inner_str += "UNDEF "
            elif value.startswith(("http://", "https://")):
                inner_str += "<{}> ".format(value)
            else:
                # Escape characters.
                for to_replace, replace_with in ESCAPE_CHARS.items():
                    value = value.replace(to_replace, replace_with)
                inner_str += '"{}" '.format(value)
        if inner_str not in seen_rows:
            seen_rows.add(inner_str)
            rows.append("( {} )\n".format(inner_str))
    bindings_str = "".join(rows)

    projection_vars = set()
    for tp in patterns:
        projection_vars.update(tp.variables)

    base_query = values_tmplt.format(
        " ".join(["?{}".format(var) for var in projection_vars]), query_str,
        "( {} )".format(values_header), bindings_str)

    # Pagination settings.
    limit = kwargs.get("limit", PAGE_SIZE)
    # Any non-positive limit means "no paging". Previously only limit == 0
    # stopped the loop, so a negative limit never terminated.
    paginate = limit > 0
    page_index = 0
    count_requests = 0
    card_sum = 0
    elapsed = 0

    while True:
        count_requests += 1

        if paginate:
            values_query = "{} LIMIT {} OFFSET {}".format(
                base_query, limit, page_index * limit)
        else:
            values_query = base_query

        params = {"query": values_query, "format": "json", "timeout": "30000"}
        response = http.get(server, params=params, headers=headers)
        elapsed += response.elapsed.total_seconds()

        if response.status_code != 200:
            print("Could not contact SPARQL endpoint {}. Status code: {}".format(
                server, response.status_code))
            print(response.url)
            break

        # Successfully contacted the server.
        result_rows = response.json()['results']['bindings']
        if len(result_rows) == 0:
            break

        results = parse_response(queue, result_rows)
        card_sum += results

        # A short page (or an unpaginated query) is the last one.
        if not paginate or results < limit:
            break
        page_index += 1

    mgs = {
        "requests": count_requests,
        "tuples": card_sum,
        "timestamp": str(datetime.datetime.now()),
        "elapsed": elapsed,
        "interface": "sparql"
    }
    logger.info(str(mgs))
    logger_debug.info(str({"querypath": response.url}))

    return count_requests