Example #1
def __update_fragment_cache(fid, gp):
    """
    Recreate fragment <fid> cached data and all its data-contexts from the corresponding stream (Redis)
    :param fid: Fragment id
    :param gp: Fragment graph pattern
    """
    fragments_cache.remove_context(fragments_cache.get_context('/' + fid))

    gp_graph = graph_from_gp(gp)
    roots = filter(lambda x: gp_graph.in_degree(x) == 0, gp_graph.nodes())

    fragment_triples = load_stream_triples(fid, calendar.timegm(dt.utcnow().timetuple()))
    visited_contexts = set([])
    for c, s, p, o in fragment_triples:
        if c not in visited_contexts:
            fragments_cache.remove_context(fragments_cache.get_context(str((fid, c))))
            visited_contexts.add(c)
        fragments_cache.get_context(str((fid, c))).add((s, p, o))
        fragments_cache.get_context('/' + fid).add((s, p, o))
        if c[0] in roots:
            fragments_cache.get_context('/' + fid).add((s, RDF.type, STOA.Root))
    visited_contexts.clear()
    with r.pipeline() as pipe:
        pipe.delete('{}:{}:stream'.format(fragments_key, fid))
        pipe.execute()
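The `graph_from_gp` helper is not part of this listing; given the `in_degree`/`nodes` calls above, it presumably builds a networkx graph from the triple patterns. A minimal sketch, assuming `gp` is an iterable of '?s <predicate> ?o' pattern strings:

import networkx as nx

def graph_from_gp(gp):
    # Hypothetical sketch: one directed edge per triple pattern, from its
    # subject term to its object term. Roots (in_degree == 0) are then the
    # subjects that never appear as the object of another pattern.
    gp_graph = nx.DiGraph()
    for tp in gp:
        s, _, o = tuple(tp.split(' '))
        gp_graph.add_edge(s, o)
    return gp_graph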
    def create(self, conjunctive=False, gid=None, loader=None, format=None):
        self.__lock.acquire()
        uuid_lock = None
        cached = False
        p = r.pipeline(transaction=True)
        p.multi()

        try:
            uuid = shortuuid.uuid()

            if conjunctive:
                if 'persist' in app.config['STORE']:
                    g = ConjunctiveGraph('Sleepycat')
                    g.open('store/resources/{}'.format(uuid), create=True)
                else:
                    g = ConjunctiveGraph()
                g.store.graph_aware = False
                self.__graph_dict[g] = uuid
                self.__uuid_dict[uuid] = g
                return g
            else:
                g = resources_cache.get_context(uuid)
                try:
                    if gid is not None:
                        st_uuid = r.hget(self.__gids_key, gid)
                        if st_uuid is not None:
                            cached = True
                            uuid = st_uuid
                            uuid_lock = self.uuid_lock(uuid)
                            uuid_lock.acquire()
                            g = self.__uuid_dict[uuid]
                            uuid_lock.release()
                        else:
                            post_ts = dt.now()
                            elapsed = (post_ts - self.__last_creation_ts).total_seconds()
                            throttling = (1.0 / GRAPH_THROTTLING) - elapsed
                            if throttling > 0:
                                sleep(throttling)

                        temp_key = '{}:cache:{}'.format(AGENT_ID, uuid)
                        counter_key = '{}:cnt'.format(temp_key)
                        ttl = MIN_CACHE_TIME + int(10 * random())
                        ttl_ts = calendar.timegm((dt.now() + datetime.timedelta(seconds=ttl)).timetuple())

                        if st_uuid is None:
                            p.delete(counter_key)
                            p.sadd(self.__cache_key, uuid)
                            p.hset(self.__gids_key, uuid, gid)
                            p.hset(self.__gids_key, gid, uuid)
                            self.__last_creation_ts = dt.now()
                        p.incr(counter_key)
                        p.set(temp_key, ttl_ts)
                        p.expire(temp_key, ttl)
                        uuid_lock = self.uuid_lock(uuid)
                        uuid_lock.acquire()
                except Exception as e:
                    log.error(e.message)
                    traceback.print_exc()
            self.__graph_dict[g] = uuid
            self.__uuid_dict[uuid] = g
            # Commit the queued Redis ops and hand the graph back
            p.execute()
            return g
        finally:
            if uuid_lock is not None:
                uuid_lock.release()
            self.__lock.release()
    def __purge(self):
        while True:
            self.__lock.acquire()
            try:
                obsolete = filter(lambda x: not r.exists('{}:cache:{}'.format(AGENT_ID, x)),
                                  r.smembers(self.__cache_key))

                if obsolete:
                    with r.pipeline(transaction=True) as p:
                        p.multi()
                        log.info('Removing {} resources from cache...'.format(len(obsolete)))
                        for uuid in obsolete:
                            uuid_lock = self.uuid_lock(uuid)
                            uuid_lock.acquire()
                            try:
                                gid = r.hget(self.__gids_key, uuid)
                                counter_key = '{}:cache:{}:cnt'.format(AGENT_ID, uuid)
                                usage_counter = r.get(counter_key)
                                if usage_counter is None or int(usage_counter) <= 0:
                                    try:
                                        resources_cache.remove_context(resources_cache.get_context(uuid))
                                        p.srem(self.__cache_key, uuid)
                                        p.hdel(self.__gids_key, uuid)
                                        p.hdel(self.__gids_key, gid)
                                        p.delete(counter_key)
                                        g = self.__uuid_dict[uuid]
                                        del self.__uuid_dict[uuid]
                                        del self.__graph_dict[g]
                                    except Exception as e:
                                        traceback.print_exc()
                                        log.error('Error purging resource {} with uuid {}'.format(gid, uuid))
                                p.execute()
                            finally:
                                uuid_lock.release()
            finally:
                self.__lock.release()
    def delivery(self, value):
        """
        Changes the delivery state of the request
        :param value: 'ready', 'sent', 'accepted', ...
        """
        with r.pipeline(transaction=True) as p:
            p.multi()
            if value == 'ready':
                p.sadd(self.__ready_key, self._request_id)
            elif value == 'sent':
                p.sadd(self.__sent_key, self._request_id)
            if value != 'ready':
                p.srem(self.__ready_key, self._request_id)
            p.hset(self._request_key, 'delivery', value)
            p.execute()
        log.info('Request {} delivery state is now "{}"'.format(self._request_id, value))
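In the full class `delivery` is presumably wrapped as a property setter (the decorator is not shown in this extract), so a hypothetical caller flips a request through its states by plain assignment:

# Hypothetical usage, assuming 'response' exposes the setter shown above:
response.delivery = 'ready'   # request joins the ready set
response.delivery = 'sent'    # removed from the ready set, added to the sent set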
def __load_fragment_requests(fid):
    """
    Load all requests and their sinks that are related to a given fragment id
    :param fid: Fragment id
    :return: A dictionary of sinks of all fragment requests
    """
    sinks_ = {}
    fragment_requests_key = '{}:{}:requests'.format(fragments_key, fid)
    for rid in r.smembers(fragment_requests_key):
        try:
            sinks_[rid] = build_response(rid).sink
        except Exception as e:
            log.warning(e.message)
            with r.pipeline(transaction=True) as p:
                p.multi()
                p.srem(fragment_requests_key, rid)
                p.execute()
    return sinks_
def _update_result_set(fid, gp):
    try:
        result_gen = _query(fid, gp)
        removed = db[fid].delete_many({}).deleted_count
        log.info('{} rows removed from fragment {} result set'.format(removed, fid))
        table = db[fid]
        rows = set(result_gen)
        if rows:
            table.insert_many([{label: row[row.labels[label]] for label in row.labels} for row in rows])
        log.info('{} rows inserted into fragment {} result set'.format(len(rows), fid))

        with r.pipeline(transaction=True) as p:
            p.multi()
            p.set('{}:{}:rs'.format(fragments_key, fid), True)
            p.execute()

    except Exception as e:
        traceback.print_exc()
        log.error(e.message)
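The comprehension above flattens each query result row into a document keyed by variable name. Assuming rows behave like rdflib's ResultRow, whose `labels` attribute maps a variable name to its column index, the per-row mapping is equivalent to this sketch:

def row_to_document(row):
    # A row binding ?name and ?age becomes {'name': <value>, 'age': <value>}
    return {label: row[row.labels[label]] for label in row.labels}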
def __remove_fragment(fid):
    """
    Completely remove a fragment from the system after notifying its known consumers
    :param fid: Fragment identifier
    """
    log.debug('Waiting to remove fragment {}...'.format(fid))
    lock = fragment_lock(fid)
    lock.acquire()

    r_sinks = __load_fragment_requests(fid)
    __notify_completion(fid, r_sinks)
    fragment_keys = r.keys('{}:{}*'.format(fragments_key, fid))
    with r.pipeline(transaction=True) as p:
        p.multi()
        for key in fragment_keys:
            p.delete(key)
        p.srem(fragments_key, fid)
        p.execute()

    # Fragment lock key was just implicitly removed, so it's not necessary to release the lock
    # lock.release()
    log.info('Fragment {} has been removed'.format(fid))
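The closing comment implies that `fragment_lock` returns a Redis-backed lock whose key lives under the fragment prefix, which is why deleting '{fragments_key}:{fid}*' also drops it. A minimal sketch of such a helper, assuming redis-py's built-in lock:

def fragment_lock(fid):
    # Hypothetical sketch: the lock key matches the '{fragments_key}:{fid}*'
    # pattern deleted in __remove_fragment, so no explicit release is needed there.
    return r.lock('{}:{}:lock'.format(fragments_key, fid))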
def __update_fragment_cache(fid, gp):
    """
    Recreate fragment <fid> cached data and all its data-contexts from the corresponding stream (Redis)
    :param fid: Fragment id
    :param gp: Fragment graph pattern
    """
    plan_tps = fragments_cache.get_context(fid).subjects(RDF.type, AGORA.TriplePattern)
    fragments_cache.remove_context(fragments_cache.get_context('/' + fid))
    for tp in plan_tps:
        fragments_cache.remove_context(
            fragments_cache.get_context(str((fid, __extract_tp_from_plan(fragments_cache, tp)))))

    gp_graph = graph_from_gp(gp)
    roots = filter(lambda x: gp_graph.in_degree(x) == 0, gp_graph.nodes())

    fragment_triples = load_stream_triples(fid, calendar.timegm(dt.now().timetuple()))
    for c, s, p, o in fragment_triples:
        fragments_cache.get_context(str((fid, c))).add((s, p, o))
        fragments_cache.get_context('/' + fid).add((s, p, o))
        if c[0] in roots:
            fragments_cache.get_context('/' + fid).add((s, RDF.type, STOA.Root))
    with r.pipeline() as pipe:
        pipe.delete('{}:{}:stream'.format(fragments_key, fid))
        pipe.execute()
    def stream(self, value):
        with r.pipeline(transaction=True) as p:
            p.multi()
            p.hset(self._request_key, '__stream', value)
            p.execute()
        log.info('Request {} stream state is now "{}"'.format(self._request_id, value))
    def set_link(self, link):
        with r.pipeline(transaction=True) as p:
            p.multi()
            p.hset('{}:links:status'.format(self._enrichment_key), str(link), True)
            p.execute()
def __pull_fragment(fid):
    """
    Pull and replace (if needed) a given fragment
    :param fid: Fragment id
    """

    fragment_key = '{}:{}'.format(fragments_key, fid)

    # Load fragment graph pattern
    tps = r.smembers('{}:gp'.format(fragment_key))
    # Load fragment requests (including their sinks)
    r_sinks = __load_fragment_requests(fid)
    log.info("""Starting collection of fragment {}:
                    - GP: {}
                    - Supporting: ({}) {}""".format(fid, list(tps), len(r_sinks), list(r_sinks)))

    # Prepare the corresponding fragment generator and fetch the search plan
    start_time = datetime.now()
    try:
        fgm_gen, _, graph = agora_client.get_fragment_generator('{ %s }' % ' . '.join(tps), workers=N_COLLECTORS,
                                                                provider=graph_provider, queue_size=N_COLLECTORS)
    except Exception:
        log.error('Agora is not available')
        return

    # If there is no SearchTree in the plan, notify, remove and abort the collection
    if not list(graph.subjects(RDF.type, AGORA.SearchTree)):
        log.info('There is no search plan for fragment {}. Removing...'.format(fid))
        # TODO: Send additional headers notifying the reason to end
        __notify_completion(fid, r_sinks)
        __remove_fragment(fid)
        return

    # Update cache graph prefixes
    __bind_prefixes(graph)

    # Extract triple patterns' dictionary from the search plan
    context_tp = {tpn: __extract_tp_from_plan(graph, tpn) for tpn in
                  graph.subjects(RDF.type, AGORA.TriplePattern)}
    frag_contexts = {tpn: (fid, context_tp[tpn]) for tpn in context_tp}

    lock = fragment_lock(fid)
    lock.acquire()

    # Update fragment contexts
    with r.pipeline(transaction=True) as p:
        p.multi()
        p.set('{}:pulling'.format(fragment_key), True)
        contexts_key = '{}:contexts'.format(fragment_key)
        p.delete(contexts_key)
        for tpn in context_tp.keys():
            p.sadd(contexts_key, frag_contexts[tpn])
        p.execute()
    lock.release()

    # Init fragment collection counters
    n_triples = 0
    fragment_weight = 0
    fragment_delta = 0

    log.info('Collecting fragment {}...'.format(fid))
    try:
        # Iterate all fragment triples and their contexts
        for (c, s, p, o) in fgm_gen:
            pre_ts = datetime.now()
            # Update weights and counters
            triple_weight = len(u'{}{}{}'.format(s, p, o))
            fragment_weight += triple_weight
            fragment_delta += triple_weight

            # Store the triple if it was not obtained before and notify related requests
            try:
                lock.acquire()
                new_triple = add_stream_triple(fid, context_tp[c], (s, p, o))
                lock.release()
                if new_triple:
                    __consume_quad(fid, (context_tp[c], s, p, o), graph, sinks=r_sinks)
                n_triples += 1
            except Exception as e:
                log.warning(e.message)
                traceback.print_exc()

            if fragment_delta > 10000:
                fragment_delta = 0
                log.info('Pulling fragment {} [{} kB]'.format(fid, fragment_weight / 1000.0))

            if n_triples % 100 == 0:
                # Update fragment requests
                if r.scard('{}:requests'.format(fragment_key)) != len(r_sinks):
                    r_sinks = __load_fragment_requests(fid)

            post_ts = datetime.now()
            elapsed = (post_ts - pre_ts).total_seconds()
            throttling = THROTTLING_TIME - elapsed
            if throttling > 0:
                sleep(throttling)
    except Exception as e:
        log.warning(e.message)
        traceback.print_exc()

    elapsed = (datetime.now() - start_time).total_seconds()
    log.info(
        '{} triples retrieved for fragment {} in {} s [{} kB]'.format(n_triples, fid, elapsed,
                                                                      fragment_weight / 1000.0))

    # Update fragment cache and its contexts
    lock.acquire()
    try:
        __update_fragment_cache(fid, tps)
        log.info('Fragment {} data has been replaced with the recently collected'.format(fid))
        __cache_plan_context(fid, graph)
        log.info('BGP context of fragment {} has been cached'.format(fid))
        log.info('Updating result set for fragment {}...'.format(fid))
        # __update_result_set(fid, tps)

        # Calculate sync times and update fragment flags
        with r.pipeline(transaction=True) as p:
            p.multi()
            sync_key = '{}:sync'.format(fragment_key)
            demand_key = '{}:on_demand'.format(fragment_key)
            # Fragment is now synced
            p.set(sync_key, True)
            # If the fragment collection time has not exceeded the threshold, switch to on-demand mode
            if elapsed < ON_DEMAND_TH and elapsed * random.random() < ON_DEMAND_TH / 4:
                p.set(demand_key, True)
                log.info('Fragment {} has been switched to on-demand mode'.format(fid))
            else:
                p.delete(demand_key)
                min_durability = int(max(MIN_SYNC, elapsed))
                durability = random.randint(min_durability, min_durability * 2)
                p.expire(sync_key, durability)
                log.info('Fragment {} is considered synced for {} s'.format(fid, durability))
            p.execute()
    finally:
        lock.release()
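For instance, with MIN_SYNC = 10 and a collection that took elapsed = 30 s, min_durability is 30 and the fragment is considered synced for a random period between 30 and 60 s.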
Example #13
def __pull_fragment(fid):
    """
    Pull and replace (if needed) a given fragment
    :param fid: Fragment id
    """

    fragment_key = '{}:{}'.format(fragments_key, fid)
    on_events = r.get('{}:events'.format(fragment_key))

    if on_events == 'True' and not change_in_fragment_resource(fid, int(r.get('{}:ud'.format(fragment_key)))):
        with r.pipeline(transaction=True) as p:
            p.multi()
            sync_key = '{}:sync'.format(fragment_key)
            p.set(sync_key, True)
            durability = int(r.get('{}:ud'.format(fragment_key)))
            p.expire(sync_key, durability)
            p.set('{}:updated'.format(fragment_key), calendar.timegm(dt.utcnow().timetuple()))
            p.delete('{}:pulling'.format(fragment_key))
            p.execute()
        return

    # Load fragment graph pattern
    tps = r.smembers('{}:gp'.format(fragment_key))
    # Load fragment requests (including their sinks)
    r_sinks = __load_fragment_requests(fid)
    log.info("""Starting collection of fragment {}:
                    - GP: {}
                    - Supporting: ({}) {}""".format(fid, list(tps), len(r_sinks), list(r_sinks)))

    init_fragment_resources(fid)

    # Prepare the corresponding fragment generator and fetch the search plan
    start_time = datetime.utcnow()
    try:
        fgm_gen, _, graph = agora_client.get_fragment_generator('{ %s }' % ' . '.join(tps), workers=N_COLLECTORS,
                                                                provider=graph_provider, queue_size=N_COLLECTORS*100)

    except Exception:
        traceback.print_exc()
        log.error('Agora is not available')
        return

    # If there is no SearchTree in the plan, notify, remove and abort the collection
    if not list(graph.subjects(RDF.type, AGORA.SearchTree)):
        log.info('There is no search plan for fragment {}. Removing...'.format(fid))
        # TODO: Send additional headers notifying the reason to end
        __notify_completion(fid, r_sinks)
        __remove_fragment(fid)
        return

    # Update cache graph prefixes
    __bind_prefixes(graph)

    # Extract triple patterns' dictionary from the search plan
    context_tp = {tpn: __extract_tp_from_plan(graph, tpn) for tpn in
                  graph.subjects(RDF.type, AGORA.TriplePattern)}
    frag_contexts = {tpn: (fid, context_tp[tpn]) for tpn in context_tp}

    lock = fragment_lock(fid)
    lock.acquire()

    # Update fragment contexts
    with r.pipeline(transaction=True) as p:
        p.multi()
        p.set('{}:pulling'.format(fragment_key), True)
        contexts_key = '{}:contexts'.format(fragment_key)
        p.delete(contexts_key)
        clear_fragment_stream(fid)
        for tpn in context_tp.keys():
            p.sadd(contexts_key, frag_contexts[tpn])
        p.execute()
    lock.release()

    # Init fragment collection counters
    n_triples = 0
    fragment_weight = 0
    fragment_delta = 0

    log.info('Collecting fragment {}...'.format(fid))
    try:
        # Iterate all fragment triples and their contexts
        pre_ts = datetime.utcnow()
        for (c, s, p, o) in fgm_gen:
            # Update weights and counters
            triple_weight = len(u'{}{}{}'.format(s, p, o))
            fragment_weight += triple_weight
            fragment_delta += triple_weight

            # Store the triple if it was not obtained before and notify related requests
            try:
                lock.acquire()
                new_triple = add_stream_triple(fid, context_tp[c], (s, p, o))
                lock.release()
                if new_triple:
                    if isinstance(s, URIRef):
                        if s not in resource_in_fragment:
                            resource_in_fragment[s] = set([])
                        resource_in_fragment[s].add(fid)
                        fragment_resources[fid].add(s)
                    __consume_quad(fid, (context_tp[c], s, p, o), graph, sinks=r_sinks)
                n_triples += 1
            except Exception as e:
                log.warning(e.message)
                traceback.print_exc()

            if fragment_delta > 10000:
                fragment_delta = 0
                log.info('Pulling fragment {} [{} kB]'.format(fid, fragment_weight / 1000.0))

            if n_triples % 100 == 0:
                # Update fragment requests
                if r.scard('{}:requests'.format(fragment_key)) != len(r_sinks):
                    r_sinks = __load_fragment_requests(fid)

            post_ts = datetime.utcnow()
            elapsed = (post_ts - pre_ts).total_seconds()
            throttling = THROTTLING_TIME - elapsed
            if throttling > 0:
                sleep(throttling)
            pre_ts = datetime.utcnow()
    except Exception as e:
        log.warning(e.message)
        traceback.print_exc()

    elapsed = (datetime.utcnow() - start_time).total_seconds()
    log.info(
        '{} triples retrieved for fragment {} in {} s [{} kB]'.format(n_triples, fid, elapsed,
                                                                      fragment_weight / 1000.0))

    # Update fragment cache and its contexts
    lock.acquire()
    try:
        __update_fragment_cache(fid, tps)
        log.info('Fragment {} data has been replaced with the recently collected'.format(fid))
        __cache_plan_context(fid, graph)
        log.info('BGP context of fragment {} has been cached'.format(fid))
        log.info('Updating result set for fragment {}...'.format(fid))

        # Calculate sync times and update fragment flags
        with r.pipeline(transaction=True) as p:
            p.multi()
            sync_key = '{}:sync'.format(fragment_key)
            demand_key = '{}:on_demand'.format(fragment_key)
            # Fragment is now synced
            p.set(sync_key, True)
            # If the fragment collection time has not exceeded the threshold, switch to on-demand mode
            # if elapsed < ON_DEMAND_TH and elapsed * random.random() < ON_DEMAND_TH / 4:
            #     p.set(demand_key, True)
            #     log.info('Fragment {} has been switched to on-demand mode'.format(fid))
            # else:
            p.delete(demand_key)

            updated_delay = int(r.get('{}:ud'.format(fragment_key)))
            last_requests_ts = [int(x) for x in r.lrange('{}:hist'.format(fragment_key), 0, -1)]
            log.debug('Last request timestamps: {}'.format(last_requests_ts))
            current_ts = calendar.timegm(datetime.utcnow().timetuple())
            first_collection = r.get('{}:updated'.format(fragment_key)) is None
            base_ts = last_requests_ts[:]
            if not first_collection:
                if current_ts - base_ts[0] <= updated_delay:
                    current_ts += updated_delay  # Force
                base_ts = [current_ts] + base_ts
            request_intervals = [i - j for i, j in zip(base_ts[:-1], base_ts[1:])]
            if request_intervals:
                avg_gap = sum(request_intervals) / len(request_intervals)
                log.debug('Average request gap: {}'.format(avg_gap))
                durability = avg_gap - elapsed if avg_gap > updated_delay else updated_delay - elapsed
            else:
                durability = updated_delay - elapsed

            durability = int(max(durability, 1))
            log.debug('Computed durability: {}'.format(durability))
            if durability <= updated_delay - elapsed:
                p.expire(sync_key, durability)
                log.info('Fragment {} is considered synced for {} s'.format(fid, durability))
            else:
                clear_fragment_stream(fid)
                p.delete('{}:updated'.format(fragment_key))
                p.delete('{}:hist'.format(fragment_key))
                log.info('Fragment {} will no longer be automatically updated'.format(fid))

            p.set('{}:updated'.format(fragment_key), calendar.timegm(dt.utcnow().timetuple()))
            p.delete('{}:pulling'.format(fragment_key))
            p.execute()

        __notify_completion(fid, r_sinks)
    finally:
        lock.release()

    log.info('Fragment {} collection is complete!'.format(fid))
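To make the durability heuristic above concrete, here is a self-contained sketch of the same computation (names mirror the variables in `__pull_fragment`; the sample timestamps are illustrative):

def estimate_durability(last_requests_ts, updated_delay, elapsed, current_ts, first_collection):
    # Mirrors the sync-window logic above: the average gap between recent
    # requests decides how long the fragment is considered synced.
    base_ts = last_requests_ts[:]
    if not first_collection:
        if current_ts - base_ts[0] <= updated_delay:
            current_ts += updated_delay  # Force a minimum spacing
        base_ts = [current_ts] + base_ts
    request_intervals = [i - j for i, j in zip(base_ts[:-1], base_ts[1:])]
    if request_intervals:
        avg_gap = sum(request_intervals) / len(request_intervals)
        durability = avg_gap - elapsed if avg_gap > updated_delay else updated_delay - elapsed
    else:
        durability = updated_delay - elapsed
    return int(max(durability, 1))

# Requests arrived 100 s apart, collection took 12 s, update delay 60 s:
# avg_gap = 100 > 60, so durability = 100 - 12 = 88 s.
print(estimate_durability([1000, 900, 800], 60, 12, 1100, False))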
def load_stream_triples(fid, until):
    def __triplify(x):
        c, s, p, o = eval(x)
        return c, __term(s), __term(p), __term(o)

    for x in r.zrangebyscore('{}:fragments:{}:stream'.format(AGENT_ID, fid), '-inf', '{}'.format(float(until))):
        yield __triplify(x)


def add_stream_triple(fid, tp, (s, p, o), timestamp=None):
    try:
        if timestamp is None:
            timestamp = calendar.timegm(dt.utcnow().timetuple())
        quad = (tp, s.n3(), p.n3(), o.n3())
        stream_key = '{}:fragments:{}:stream'.format(AGENT_ID, fid)
        not_found = not bool(r.zscore(stream_key, quad))
        if not_found:
            with r.pipeline() as pipe:
                pipe.zadd(stream_key, timestamp, quad)
                pipe.execute()
        return not_found
    except Exception as e:
        traceback.print_exc()
        log.error(e.message)
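Together with `load_stream_triples` above, this shows the stream design: each distinct quad is stored once in a Redis sorted set, scored by the timestamp at which it was first seen, so a reader can replay everything collected up to a given moment. A hypothetical round trip (the fragment id and triple pattern are illustrative):

from rdflib import URIRef, Literal

s, p, o = URIRef('http://example.org/alice'), URIRef('http://xmlns.com/foaf/0.1/name'), Literal('Alice')
if add_stream_triple('f1', '?s foaf:name ?o', (s, p, o)):
    log.info('New triple recorded for fragment f1')
# Replay everything collected so far for the fragment:
for c, s_, p_, o_ in load_stream_triples('f1', calendar.timegm(dt.utcnow().timetuple())):
    log.debug('{} -> ({}, {}, {})'.format(c, s_, p_, o_))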


class GraphProvider(object):
    def __init__(self):
        self.__last_creation_ts = dt.now()
        self.__graph_dict = {}
        self.__uuid_dict = {}
        self.__gid_uuid_dict = {}
        self.__lock = Lock()
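Based on the `create` method shown earlier, a `GraphProvider` would presumably be used along these lines (the `gid` value is illustrative):

provider = GraphProvider()
# A conjunctive graph, Sleepycat-backed when the store is persistent:
cg = provider.create(conjunctive=True)
# A cached, uuid-keyed context graph tied to a resource gid:
g = provider.create(gid='http://example.org/resource/1')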