def _save(self, action, general=True):
    """
    Stores data relating to the recovery of a fragment for this request
    """
    super(FragmentSink, self)._save(action)

    # Recover pattern from the request object
    self._graph_pattern = action.request.pattern

    effective_gp = self._generalize_gp() if general else self._graph_pattern

    # fragment_mapping is a tuple like (fragment_id, mapping)
    fragment_mapping = self.__check_gp_mappings(gp=effective_gp)
    exists = fragment_mapping is not None

    # Decide to proceed depending on whether it's the first time this request is received and the fragment
    # is already known
    proceed = action.id in self.passed_requests or (
        random() > 1.0 - PASS_THRESHOLD if not exists else random() > PASS_THRESHOLD)
    if not proceed:
        self.do_pass(action)
    if action.id in self.passed_requests:
        self.passed_requests.remove(action.id)

    if not exists:
        # If there is no mapping, register a new fragment collection for the general graph pattern
        fragment_id = str(uuid())
        self._fragment_key = self.__f_key_pattern.format(fragment_id)
        self._pipe.sadd(self._fragments_key, fragment_id)
        self._pipe.sadd('{}:gp'.format(self._fragment_key), *effective_gp)
        mapping = {str(k): str(k) for k in action.request.variable_labels}
        mapping.update({str(k): str(k) for k in self._filter_mapping})
    else:
        fragment_id, mapping = fragment_mapping
        self._fragment_key = self.__f_key_pattern.format(fragment_id)
        # Remove the sync state if the fragment is in on-demand mode
        if r.get('{}:on_demand'.format(self._fragment_key)) is not None:
            self._pipe.delete('{}:sync'.format(self._fragment_key))

    # Here the following is persisted: mapping, pref_labels, fragment-request links and the original graph_pattern
    self._pipe.hmset('{}map'.format(self._request_key), mapping)
    if action.request.preferred_labels:
        self._pipe.sadd('{}pl'.format(self._request_key), *action.request.preferred_labels)
    self._pipe.sadd('{}:requests'.format(self._fragment_key), self._request_id)
    self._pipe.hset('{}'.format(self._request_key), 'fragment_id', fragment_id)
    self._pipe.sadd('{}gp'.format(self._request_key), *self._graph_pattern)
    self._pipe.hset('{}'.format(self._request_key), 'pattern', ' . '.join(self._graph_pattern))

    # Populate attributes that may be required during the rest of the submission process
    self._dict_fields['mapping'] = mapping
    self._dict_fields['preferred_labels'] = action.request.preferred_labels
    self._dict_fields['fragment_id'] = fragment_id

    if not exists:
        log.info('Request {} has started a new fragment collection: {}'.format(self._request_id, fragment_id))
    else:
        log.info('Request {} is going to re-use fragment {}'.format(self._request_id, fragment_id))
        n_fragment_reqs = r.scard('{}:requests'.format(self._fragment_key))
        log.info('Fragment {} is supporting {} more requests'.format(fragment_id, n_fragment_reqs))
def __collect_fragments():
    registered_fragments = r.scard(fragments_key)
    synced_fragments = len(r.keys('{}:*:sync'.format(fragments_key)))
    log.info("""Collector daemon started:
    - Fragments: {}
    - Synced: {}""".format(registered_fragments, synced_fragments))

    futures = {}
    while True:
        for fid in filter(
                lambda x: r.get('{}:{}:sync'.format(fragments_key, x)) is None and r.get(
                    '{}:{}:pulling'.format(fragments_key, x)) is None,
                r.smembers(fragments_key)):
            if fid in futures:
                if futures[fid].done():
                    del futures[fid]
            if fid not in futures:
                futures[fid] = thp.submit(__pull_fragment, fid)
        time.sleep(1)
def __pull_fragment(fid):
    """
    Pull and replace (if needed) a given fragment
    :param fid: Fragment id
    """
    fragment_key = '{}:{}'.format(fragments_key, fid)

    # Load fragment graph pattern
    tps = r.smembers('{}:gp'.format(fragment_key))
    # Load fragment requests (including their sinks)
    r_sinks = __load_fragment_requests(fid)

    log.info("""Starting collection of fragment {}:
    - GP: {}
    - Supporting: ({}) {}""".format(fid, list(tps), len(r_sinks), list(r_sinks)))

    # Prepare the corresponding fragment generator and fetch the search plan
    start_time = datetime.now()
    try:
        fgm_gen, _, graph = agora_client.get_fragment_generator('{ %s }' % ' . '.join(tps),
                                                                workers=N_COLLECTORS,
                                                                provider=graph_provider,
                                                                queue_size=N_COLLECTORS)
    except Exception:
        log.error('Agora is not available')
        return

    # In case there is no SearchTree in the plan: notify, remove and abort collection
    if not list(graph.subjects(RDF.type, AGORA.SearchTree)):
        log.info('There is no search plan for fragment {}. Removing...'.format(fid))
        # TODO: Send additional headers notifying the reason to end
        __notify_completion(fid, r_sinks)
        __remove_fragment(fid)
        return

    # Update cache graph prefixes
    __bind_prefixes(graph)

    # Extract triple patterns' dictionary from the search plan
    context_tp = {tpn: __extract_tp_from_plan(graph, tpn) for tpn in
                  graph.subjects(RDF.type, AGORA.TriplePattern)}
    frag_contexts = {tpn: (fid, context_tp[tpn]) for tpn in context_tp}

    lock = fragment_lock(fid)
    lock.acquire()

    # Update fragment contexts
    with r.pipeline(transaction=True) as p:
        p.multi()
        p.set('{}:pulling'.format(fragment_key), True)
        contexts_key = '{}:contexts'.format(fragment_key)
        p.delete(contexts_key)
        for tpn in context_tp.keys():
            p.sadd(contexts_key, frag_contexts[tpn])
        p.execute()
    lock.release()

    # Init fragment collection counters
    n_triples = 0
    fragment_weight = 0
    fragment_delta = 0

    log.info('Collecting fragment {}...'.format(fid))
    try:
        # Iterate all fragment triples and their contexts
        for (c, s, p, o) in fgm_gen:
            pre_ts = datetime.now()
            # Update weights and counters
            triple_weight = len(u'{}{}{}'.format(s, p, o))
            fragment_weight += triple_weight
            fragment_delta += triple_weight

            # Store the triple if it was not obtained before and notify related requests
            try:
                lock.acquire()
                new_triple = add_stream_triple(fid, context_tp[c], (s, p, o))
                lock.release()
                if new_triple:
                    __consume_quad(fid, (context_tp[c], s, p, o), graph, sinks=r_sinks)
                    n_triples += 1
            except Exception as e:
                log.warning(e.message)
                traceback.print_exc()

            if fragment_delta > 10000:
                fragment_delta = 0
                log.info('Pulling fragment {} [{} kB]'.format(fid, fragment_weight / 1000.0))

            if n_triples % 100 == 0:
                # Update fragment requests
                if r.scard('{}:requests'.format(fragment_key)) != len(r_sinks):
                    r_sinks = __load_fragment_requests(fid)

            post_ts = datetime.now()
            elapsed = (post_ts - pre_ts).total_seconds()
            throttling = THROTTLING_TIME - elapsed
            if throttling > 0:
                sleep(throttling)
    except Exception as e:
        log.warning(e.message)
        traceback.print_exc()
def __pull_fragment(fid):
    """
    Pull and replace (if needed) a given fragment
    :param fid: Fragment id
    """
    fragment_key = '{}:{}'.format(fragments_key, fid)
    on_events = r.get('{}:events'.format(fragment_key))

    if on_events == 'True' and not change_in_fragment_resource(fid, int(r.get('{}:ud'.format(fragment_key)))):
        with r.pipeline(transaction=True) as p:
            p.multi()
            sync_key = '{}:sync'.format(fragment_key)
            p.set(sync_key, True)
            durability = int(r.get('{}:ud'.format(fragment_key)))
            p.expire(sync_key, durability)
            p.set('{}:updated'.format(fragment_key), calendar.timegm(dt.utcnow().timetuple()))
            p.delete('{}:pulling'.format(fragment_key))
            p.execute()
        return

    # Load fragment graph pattern
    tps = r.smembers('{}:gp'.format(fragment_key))
    # Load fragment requests (including their sinks)
    r_sinks = __load_fragment_requests(fid)

    log.info("""Starting collection of fragment {}:
    - GP: {}
    - Supporting: ({}) {}""".format(fid, list(tps), len(r_sinks), list(r_sinks)))

    init_fragment_resources(fid)

    # Prepare the corresponding fragment generator and fetch the search plan
    start_time = datetime.utcnow()
    try:
        fgm_gen, _, graph = agora_client.get_fragment_generator('{ %s }' % ' . '.join(tps),
                                                                workers=N_COLLECTORS,
                                                                provider=graph_provider,
                                                                queue_size=N_COLLECTORS * 100)
    except Exception:
        traceback.print_exc()
        log.error('Agora is not available')
        return

    # In case there is no SearchTree in the plan: notify, remove and abort collection
    if not list(graph.subjects(RDF.type, AGORA.SearchTree)):
        log.info('There is no search plan for fragment {}. Removing...'.format(fid))
        # TODO: Send additional headers notifying the reason to end
        __notify_completion(fid, r_sinks)
        __remove_fragment(fid)
        return

    # Update cache graph prefixes
    __bind_prefixes(graph)

    # Extract triple patterns' dictionary from the search plan
    context_tp = {tpn: __extract_tp_from_plan(graph, tpn) for tpn in
                  graph.subjects(RDF.type, AGORA.TriplePattern)}
    frag_contexts = {tpn: (fid, context_tp[tpn]) for tpn in context_tp}

    lock = fragment_lock(fid)
    lock.acquire()

    # Update fragment contexts
    with r.pipeline(transaction=True) as p:
        p.multi()
        p.set('{}:pulling'.format(fragment_key), True)
        contexts_key = '{}:contexts'.format(fragment_key)
        p.delete(contexts_key)
        clear_fragment_stream(fid)
        for tpn in context_tp.keys():
            p.sadd(contexts_key, frag_contexts[tpn])
        p.execute()
    lock.release()

    # Init fragment collection counters
    n_triples = 0
    fragment_weight = 0
    fragment_delta = 0

    log.info('Collecting fragment {}...'.format(fid))
    try:
        # Iterate all fragment triples and their contexts
        pre_ts = datetime.utcnow()
        for (c, s, p, o) in fgm_gen:
            # Update weights and counters
            triple_weight = len(u'{}{}{}'.format(s, p, o))
            fragment_weight += triple_weight
            fragment_delta += triple_weight

            # Store the triple if it was not obtained before and notify related requests
            try:
                lock.acquire()
                new_triple = add_stream_triple(fid, context_tp[c], (s, p, o))
                lock.release()
                if new_triple:
                    if isinstance(s, URIRef):
                        if s not in resource_in_fragment:
                            resource_in_fragment[s] = set([])
                        resource_in_fragment[s].add(fid)
                        fragment_resources[fid].add(s)
                    __consume_quad(fid, (context_tp[c], s, p, o), graph, sinks=r_sinks)
                    n_triples += 1
            except Exception as e:
                log.warning(e.message)
                traceback.print_exc()

            if fragment_delta > 10000:
                fragment_delta = 0
                log.info('Pulling fragment {} [{} kB]'.format(fid, fragment_weight / 1000.0))

            if n_triples % 100 == 0:
                # Update fragment requests
                if r.scard('{}:requests'.format(fragment_key)) != len(r_sinks):
                    r_sinks = __load_fragment_requests(fid)
                # Throttle collection every 100 triples
                post_ts = datetime.utcnow()
                elapsed = (post_ts - pre_ts).total_seconds()
                throttling = THROTTLING_TIME - elapsed
                if throttling > 0:
                    sleep(throttling)
                pre_ts = datetime.utcnow()
    except Exception as e:
        log.warning(e.message)
        traceback.print_exc()

    elapsed = (datetime.utcnow() - start_time).total_seconds()
    log.info(
        '{} triples retrieved for fragment {} in {} s [{} kB]'.format(n_triples, fid, elapsed,
                                                                      fragment_weight / 1000.0))

    # Update fragment cache and its contexts
    lock.acquire()
    try:
        __update_fragment_cache(fid, tps)
        log.info('Fragment {} data has been replaced with the recently collected'.format(fid))
        __cache_plan_context(fid, graph)
        log.info('BGP context of fragment {} has been cached'.format(fid))
        log.info('Updating result set for fragment {}...'.format(fid))

        # Calculate sync times and update fragment flags
        with r.pipeline(transaction=True) as p:
            p.multi()
            sync_key = '{}:sync'.format(fragment_key)
            demand_key = '{}:on_demand'.format(fragment_key)
            # Fragment is now synced
            p.set(sync_key, True)
            # If the fragment collection time has not exceeded the threshold, switch to on-demand mode
            # if elapsed < ON_DEMAND_TH and elapsed * random.random() < ON_DEMAND_TH / 4:
            #     p.set(demand_key, True)
            #     log.info('Fragment {} has been switched to on-demand mode'.format(fid))
            # else:
            p.delete(demand_key)

            updated_delay = int(r.get('{}:ud'.format(fragment_key)))
            last_requests_ts = map(lambda x: int(x), r.lrange('{}:hist'.format(fragment_key), 0, -1))
            log.debug(last_requests_ts)
            current_ts = calendar.timegm(datetime.utcnow().timetuple())
            first_collection = r.get('{}:updated'.format(fragment_key)) is None
            base_ts = last_requests_ts[:]
            if not first_collection:
                if current_ts - base_ts[0] <= updated_delay:
                    current_ts += updated_delay  # Force
                base_ts = [current_ts] + base_ts

            # Estimate durability from the average gap between recent requests and the collection time
            request_intervals = [i - j for i, j in zip(base_ts[:-1], base_ts[1:])]
            if request_intervals:
                avg_gap = reduce(lambda x, y: x + y, request_intervals) / len(request_intervals)
                log.debug('avg_gap: {}'.format(avg_gap))
                durability = avg_gap - elapsed if avg_gap > updated_delay else updated_delay - elapsed
            else:
                durability = updated_delay - elapsed

            durability = int(max(durability, 1))
            log.debug('durability: {}'.format(durability))
            if durability <= updated_delay - elapsed:
                p.expire(sync_key, durability)
                log.info('Fragment {} is considered synced for {} s'.format(fid, durability))
            else:
                clear_fragment_stream(fid)
                p.delete('{}:updated'.format(fragment_key))
                p.delete('{}:hist'.format(fragment_key))
                log.info('Fragment {} will no longer be automatically updated'.format(fid))

            p.set('{}:updated'.format(fragment_key), calendar.timegm(dt.utcnow().timetuple()))
            p.delete('{}:pulling'.format(fragment_key))
            p.execute()

        __notify_completion(fid, r_sinks)
    finally:
        lock.release()

    log.info('Fragment {} collection is complete!'.format(fid))
def _save(self, action, general=True):
    """
    Stores data relating to the recovery of a fragment for this request
    """
    super(FragmentSink, self)._save(action)

    # Override general parameter
    general = general and action.request.allow_generalisation

    # Fragment collection parameters
    requested_updating_delay = action.request.updating_delay
    if action.request.updating_delay is None:
        requested_updating_delay = MIN_SYNC_TIME
    self._pipe.hset(self._request_key, 'updating_delay', requested_updating_delay)
    self._pipe.hset(self._request_key, 'allow_generalisation', action.request.allow_generalisation)

    # Recover pattern from the request object
    self._graph_pattern = action.request.pattern

    effective_gp = self._generalize_gp() if general else self._graph_pattern

    # fragment_mapping is a tuple like (fragment_id, mapping)
    fragment_mapping = self.__check_gp_mappings(gp=effective_gp)
    exists = fragment_mapping is not None

    # Decide to proceed depending on whether it's the first time this request is received and the fragment
    # is already known
    proceed = action.id in self.passed_requests or (
        random() > 1.0 - PASS_THRESHOLD if not exists else random() > PASS_THRESHOLD)
    if not proceed:
        self.do_pass(action)
    if action.id in self.passed_requests:
        self.passed_requests.remove(action.id)

    lock = None
    try:
        if not exists:
            # If there is no mapping, register a new fragment collection for the general graph pattern
            fragment_id = str(uuid())
            self._fragment_key = self.__f_key_pattern.format(fragment_id)
            self._pipe.sadd(self._fragments_key, fragment_id)
            self._pipe.sadd('{}:gp'.format(self._fragment_key), *effective_gp)
            mapping = {str(k): str(k) for k in action.request.variable_labels}
            mapping.update({str(k): str(k) for k in self._filter_mapping})
        else:
            fragment_id, mapping = fragment_mapping
            self._fragment_key = self.__f_key_pattern.format(fragment_id)

        lock = fragment_lock(fragment_id)
        lock.acquire()

        # Remove the sync state if the fragment is in on-demand mode
        if r.get('{}:on_demand'.format(self._fragment_key)) is not None:
            self._pipe.delete('{}:sync'.format(self._fragment_key))

        # Here the following is persisted: mapping, pref_labels, fragment-request links and the original
        # graph_pattern
        self._pipe.hmset('{}map'.format(self._request_key), mapping)
        if action.request.preferred_labels:
            self._pipe.sadd('{}pl'.format(self._request_key), *action.request.preferred_labels)
        self._pipe.sadd('{}:requests'.format(self._fragment_key), self._request_id)
        self._pipe.hset(self._request_key, 'fragment_id', fragment_id)
        self._pipe.sadd('{}gp'.format(self._request_key), *self._graph_pattern)
        self._pipe.hset(self._request_key, 'pattern', ' . '.join(self._graph_pattern))

        # Update collection parameters
        fragment_synced = True
        current_updated = r.get('{}:updated'.format(self._fragment_key))
        if current_updated is not None:
            current_updated = dt.utcfromtimestamp(float(current_updated))
            utcnow = dt.utcnow()
            limit = utcnow - delta(seconds=requested_updating_delay)
            if limit > current_updated:
                diff = (limit - current_updated).total_seconds()
                self._pipe.delete('{}:sync'.format(self._fragment_key))
                fragment_synced = False
                # if diff > requested_updating_delay / 2.0:
                #     self._pipe.delete('{}:updated'.format(self._fragment_key))

        current_updating_delay = int(
            r.get('{}:ud'.format(self._fragment_key))) if exists and fragment_synced else sys.maxint
        if current_updating_delay > requested_updating_delay:
            self._pipe.set('{}:ud'.format(self._fragment_key), requested_updating_delay)

        current_on_events = r.get('{}:events'.format(self._fragment_key))
        requested_on_events = action.request.update_on_events
        if current_on_events is None or (current_on_events is not None and current_on_events == 'True'):
            self._pipe.set('{}:events'.format(self._fragment_key), requested_on_events)

        # Update fragment request history
        # if not fragment_synced:
        #     self._pipe.delete('{}:hist'.format(self._fragment_key))
        self._pipe.lpush('{}:hist'.format(self._fragment_key), calendar.timegm(datetime.utcnow().timetuple()))
        self._pipe.ltrim('{}:hist'.format(self._fragment_key), 0, 3)

        # Populate attributes that may be required during the rest of the submission process
        self._dict_fields['mapping'] = mapping
        self._dict_fields['preferred_labels'] = action.request.preferred_labels
        self._dict_fields['fragment_id'] = fragment_id

        if not exists:
            _log.info('Request {} has started a new fragment collection: {}'.format(self._request_id, fragment_id))
        else:
            _log.info('Request {} is going to re-use fragment {}'.format(self._request_id, fragment_id))
            n_fragment_reqs = r.scard('{}:requests'.format(self._fragment_key))
            _log.info('Fragment {} is supporting {} more requests'.format(fragment_id, n_fragment_reqs))
    finally:
        if lock is not None:
            lock.release()
            for rid in sent:
                r.srem(__ready_key, rid)
                r.srem(__deliveries_key, rid)
                try:
                    response = build_response(rid)
                    response.sink.remove()  # Its lock is removed too
                    __log.info('Request {} was sent and cleared'.format(rid))
                except AttributeError:
                    traceback.print_exc()
                    __log.warning('Request number {} was deleted by other means'.format(rid))
                    pass
                r.srem(__sent_key, rid)
        except Exception as e:
            __log.error(e.message)
            traceback.print_exc()
        finally:
            time.sleep(0.1)


# Log delivery counters at startup
__registered_deliveries = r.scard(__deliveries_key)
__deliveries_ready = r.scard(__ready_key)
__log.info("""Delivery daemon started:
    - Deliveries: {}
    - Ready: {}""".format(__registered_deliveries, __deliveries_ready))

# Create and start delivery daemon
__thread = Thread(target=__deliver_responses)
__thread.daemon = True
__thread.start()
    lock.acquire()
    try:
        with r.pipeline(transaction=True) as p:
            p.multi()
            sync_key = '{}:sync'.format(fragment_key)
            # Fragment is now synced
            p.set(sync_key, True)
            min_durability = int(MIN_SYNC)
            durability = random.randint(min_durability, min_durability * 2)
            p.expire(sync_key, durability)
            log.info('Fragment {} is considered synced for {} s'.format(fid, durability))
            p.set('{}:updated'.format(fragment_key), dt.now())
            p.delete('{}:pulling'.format(fragment_key))
            p.execute()
        if r.scard('{}:requests'.format(fragment_key)) != len(r_sinks):
            r_sinks = __load_fragment_requests(fid)
        __notify_completion(r_sinks)
    finally:
        lock.release()


def __collect_fragments():
    registered_fragments = r.scard(fragments_key)
    synced_fragments = len(r.keys('{}:*:sync'.format(fragments_key)))
    log.info("""Collector daemon started:
    - Fragments: {}
    - Synced: {}""".format(registered_fragments, synced_fragments))

    futures = {}
    while True: