Esempio n. 1
0
    def __init__(self,
                 endpoint,
                 start_time,
                 end_time,
                 debug=False,
                 timeseries=False):
        """Store the query parameters and derive ``step`` and ``intervals``.

        ``endpoint``, ``start_time``, ``end_time``, ``debug`` and
        ``timeseries`` are stored on the instance unchanged.

        When both ``start_time`` and ``end_time`` are truthy, ``step`` is
        chosen from the length of the range (``end_time - start_time``):

        * longer than ``6 * 30 * 24 * 3600`` -> 86400
        * longer than ``30 * 24 * 3600``     -> 3600
        * longer than ``24 * 3600``          -> 300
        * otherwise                          -> None

        and ``intervals`` covers exactly ``[start_time, end_time]``.
        Otherwise ``step`` is None and ``intervals`` covers
        ``[0, 2**32 - 1]``.
        """
        self.endpoint = endpoint
        self.start_time = start_time
        self.end_time = end_time
        self.timeseries = timeseries
        self.debug = debug

        # Define ``step`` up front so the attribute exists on every path,
        # including the one where no time range was supplied.
        self.step = None

        if start_time and end_time:
            span = end_time - start_time
            if span > 6 * 30 * 24 * 3600:
                self.step = 86400
            elif span > 30 * 24 * 3600:
                self.step = 3600
            elif span > 24 * 3600:
                self.step = 300

            self.intervals = IntervalSet([Interval(start_time, end_time)])
        else:
            self.intervals = IntervalSet([Interval(0, 2**32 - 1)])
Esempio n. 2
0
  def get_intervals(self):
    """Return an IntervalSet with one Interval per entry of the node's slice_info."""
    # Every slice_info entry unpacks to (start, end, step); step is not used.
    return IntervalSet([
      Interval(begin, finish)
      for (begin, finish, _step) in self.ceres_node.slice_info
    ])
Esempio n. 3
0
 def __init__(self, pattern, startTime, endTime):
   """Record the search pattern and the time range it applies to.

   A bound given as None is treated as unbounded: the stored interval
   uses -inf for a missing start and +inf for a missing end.
   """
   lower_bound = startTime if startTime is not None else float('-inf')
   upper_bound = endTime if endTime is not None else float('inf')

   self.pattern = pattern
   self.startTime = startTime
   self.endTime = endTime
   # NOTE(review): isExact is set to the raw result of is_pattern(); the
   # names suggest this may be inverted -- confirm against is_pattern().
   self.isExact = is_pattern(pattern)
   self.interval = Interval(lower_bound, upper_bound)
Esempio n. 4
0
    def get_results(self):
        """Yield a node for every entry in this find request's results.

        Results come from ``self.cachedResult`` when it is set.  Otherwise
        the HTTP response on ``self.connection`` is read (calling
        ``self.send()`` first when there is no connection yet), unpickled,
        and stored in the cache under ``self.cacheKey``.

        If reading or decoding the response fails, the error is logged,
        ``self.store.fail()`` is called and nothing is yielded.  Nothing is
        yielded either when ``self.failed`` is set.

        Each entry becomes a ``LeafNode`` backed by a ``RemoteReader`` or a
        ``BranchNode``; every yielded node has ``local`` set to False.
        """
        if self.failed:
            return

        if self.cachedResult is not None:
            results = self.cachedResult
        else:
            if self.connection is None:
                self.send()

            try:
                try:  # Python 2.7+, use buffering of HTTP responses
                    response = self.connection.getresponse(buffering=True)
                except TypeError:  # Python 2.6 and older
                    response = self.connection.getresponse()

                # Checked explicitly instead of with ``assert`` so an error
                # response is still rejected when Python runs with -O.
                if response.status != 200:
                    raise RuntimeError("received error response %s - %s" % (
                        response.status, response.reason))

                result_data = response.read()
                # NOTE(review): this unpickles data received from a remote
                # host; confirm ``unpickle`` restricts what may be loaded.
                results = unpickle.loads(result_data)

            except Exception:
                # ``Exception`` rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not swallowed here.
                log.exception(
                    "FindRequest.get_results(host=%s, query=%s) exception processing response"
                    % (self.store.host, self.query))
                self.store.fail()
                return

            cache.set(self.cacheKey, results, settings.FIND_CACHE_DURATION)

        for node_info in results:
            # handle both 1.x and 0.9.x output
            path = node_info.get('path') or node_info.get('metric_path')
            is_leaf = node_info.get('is_leaf') or node_info.get('isLeaf')
            intervals = node_info.get('intervals') or []
            if not isinstance(intervals, IntervalSet):
                # Plain (start, end) pairs are converted to an IntervalSet.
                intervals = IntervalSet([
                    Interval(interval[0], interval[1])
                    for interval in intervals
                ])

            node_info = {
                'is_leaf': is_leaf,
                'path': path,
                'intervals': intervals,
            }

            if is_leaf:
                reader = RemoteReader(self.store,
                                      node_info,
                                      bulk_query=self.query.pattern)
                node = LeafNode(path, reader)
            else:
                node = BranchNode(path)

            node.local = False
            yield node
Esempio n. 5
0
    def get_intervals(self):
        """Return one interval running from 1 to an hour past the current time."""
        log.info('===GET_INTERVALS===')
        # Lower bound: data is assumed to exist from the start of the epoch.
        # Upper bound: the current time plus one hour, truncated to an int.
        lower, upper = 1, int(time() + 3600)

        log.info("get_interval: start=%s; end=%s" % (lower, upper))

        whole_span = Interval(lower, upper)
        return IntervalSet([whole_span])
Esempio n. 6
0
  def get_intervals(self):
    """Return the single interval covered by the gzipped file at fs_path.

    The interval starts ``maxRetention`` (read from the file header) before
    the current time and ends at the file's mtime, but never before its
    own start.
    """
    gz_handle = gzip.GzipFile(self.fs_path, 'rb')
    try:
      header = whisper__readHeader(gz_handle)  # evil, but necessary.
    finally:
      gz_handle.close()

    oldest = time.time() - header['maxRetention']
    modified = stat(self.fs_path).st_mtime
    newest = max(modified, oldest)
    return IntervalSet([Interval(oldest, newest)])
Esempio n. 7
0
 def __init__(self, pattern, startTime, endTime,
              local=False, headers=None, leaves_only=None):
     """Record the search pattern, its time range and the query options.

     A bound given as None is treated as unbounded: the stored interval
     uses -inf for a missing start and +inf for a missing end.  ``local``,
     ``headers`` and ``leaves_only`` are stored unchanged.
     """
     lower_bound = startTime if startTime is not None else float('-inf')
     upper_bound = endTime if endTime is not None else float('inf')

     self.pattern = pattern
     self.startTime = startTime
     self.endTime = endTime
     # NOTE(review): isExact is set to the raw result of is_pattern(); the
     # names suggest this may be inverted -- confirm against is_pattern().
     self.isExact = is_pattern(pattern)
     self.interval = Interval(lower_bound, upper_bound)
     self.local = local
     self.headers = headers
     self.leaves_only = leaves_only
Esempio n. 8
0
    def _find_paths(self, currNodeRowKey, patterns):
        """Recursively yield nodes below ``currNodeRowKey`` matching ``patterns``.

        ``patterns`` is a non-empty sequence of per-level patterns.  The
        first one is matched against the children of the current row; the
        remainder is matched recursively below each matching branch.  Once
        the last pattern is reached, a ``LeafNode`` is yielded for rows that
        carry a ``cf:INFO`` column and a ``BranchNode`` for the others.

        Rows that cannot be fetched (empty ``getRow`` result) are skipped.
        """

        from graphite.node import BranchNode, LeafNode
        from graphite.intervals import Interval, IntervalSet

        pattern = patterns[0]
        patterns = patterns[1:]

        nodeRow = self.client.getRow(self.metaTable, currNodeRowKey, None)
        if not nodeRow:
            return

        # Map child name -> child row key, taken from the "cf:c_" columns.
        subnodes = {}
        for k, v in nodeRow[0].columns.items():
            if k.startswith("cf:c_"):  # branches start with c_
                # NOTE(review): split("_", 2)[1] keeps only the text up to
                # the next underscore; confirm names never contain "_".
                key = k.split("_", 2)[1]  # pop off cf:c_ prefix
                subnodes[key] = v.value

        matching_subnodes = match_entries(subnodes.keys(), pattern)

        if patterns:  # we've still got more directories to traverse
            for subnode in matching_subnodes:
                rowKey = subnodes[subnode]
                subNodeContents = self.client.getRow(self.metaTable, rowKey,
                                                     None)

                # A child whose row is missing cannot be descended into;
                # skip it rather than fail on subNodeContents[0].
                if not subNodeContents:
                    continue

                # leafs have a cf:INFO column describing their data
                # we can't possibly match on a leaf here because we have more components in the pattern,
                # so only recurse on branches
                if "cf:INFO" not in subNodeContents[0].columns:
                    for m in self._find_paths(rowKey, patterns):
                        yield m

        else:  # at the end of the pattern
            for subnode in matching_subnodes:
                rowKey = subnodes[subnode]
                nodeRow = self.client.getRow(self.metaTable, rowKey, None)
                if not nodeRow:
                    continue
                metric = rowKey.split("_", 2)[1]  # pop off "m_" in key
                if "cf:INFO" in nodeRow[0].columns:
                    info = json.loads(nodeRow[0].columns["cf:INFO"].value)
                    # Read the clock once so both ends of the interval are
                    # derived from the same instant.
                    now = time.time()
                    intervals = IntervalSet(
                        [Interval(now - info['maxRetention'], now)])
                    reader = HbaseReader(metric, intervals, info, self)
                    yield LeafNode(metric, reader)
                else:
                    yield BranchNode(metric)
Esempio n. 9
0
    def __init__(self,
                 uri,
                 metric_path,
                 start_time,
                 end_time,
                 username=None,
                 password=None,
                 debug=False):
        """Store the query parameters, build API clients, derive step/intervals.

        ``self.client`` is always an ``EsmondAPI`` for ``uri`` without
        credentials.  ``self.auth_client`` is a second ``EsmondAPI`` created
        with ``username``/``password`` when both are truthy; otherwise it is
        the same object as ``self.client``.

        When both ``start_time`` and ``end_time`` are truthy, ``step`` is
        chosen from the length of the range (``end_time - start_time``):

        * longer than ``6 * 30 * 24 * 3600`` -> 86400
        * longer than ``30 * 24 * 3600``     -> 3600
        * longer than ``24 * 3600``          -> 300
        * otherwise                          -> None

        and ``intervals`` covers exactly ``[start_time, end_time]``.
        Otherwise ``step`` is None and ``intervals`` covers
        ``[0, 2**32 - 1]``.
        """
        self.metric_path = metric_path
        self.start_time = start_time
        self.end_time = end_time
        self.username = username
        self.password = password
        self.debug = debug

        self.client = EsmondAPI(uri, debug=debug)
        if username and password:
            self.auth_client = EsmondAPI(uri,
                                         debug=debug,
                                         username=username,
                                         password=password)
        else:
            self.auth_client = self.client

        # Define ``step`` up front so the attribute exists on every path,
        # including the one where no time range was supplied.
        self.step = None

        if start_time and end_time:
            span = end_time - start_time
            if span > 6 * 30 * 24 * 3600:
                self.step = 86400
            elif span > 30 * 24 * 3600:
                self.step = 3600
            elif span > 24 * 3600:
                self.step = 300

            self.intervals = IntervalSet([Interval(start_time, end_time)])
        else:
            self.intervals = IntervalSet([Interval(0, 2**32 - 1)])
Esempio n. 10
0
 def get_intervals(self):
   """Return the single interval covered by the whisper file at fs_path.

   The interval starts ``maxRetention`` (from ``whisper.info``) before the
   current time and ends at the file's mtime, but never before its own
   start.
   """
   now = time.time()
   retention = whisper.info(self.fs_path)['maxRetention']
   oldest = now - retention
   modified = stat(self.fs_path).st_mtime
   newest = max(modified, oldest)
   return IntervalSet([Interval(oldest, newest)])
Esempio n. 11
0
 def get_intervals(self):
     """Return a zero-length interval positioned at the current time."""
     # Intervals do not matter for this type of reader, so the current
     # time is used for both ends.
     now = time.time()
     return IntervalSet([Interval(now, now)])
Esempio n. 12
0
 def get_intervals(self):
     """Return one interval spanning from 3600 before now up to now."""
     # The clock is read twice, once for each end of the interval.
     hour_ago = time.time() - 3600
     current = time.time()
     return IntervalSet([Interval(hour_ago, current)])
Esempio n. 13
0
    def find_nodes(self, query, timer=None):
        """Yield the nodes on this remote host that match ``query``.

        Results are looked up in the cache first, under a key built from
        the host, a hash of the pattern, and the start/end times rounded
        down to a multiple of ``settings.FIND_CACHE_DURATION``.  On a cache
        miss, ``/metrics/find/`` is requested from the host, the response
        is decoded (msgpack or pickle, by content type) and cached.

        :param query: object providing ``pattern``, ``startTime``,
            ``endTime`` and ``headers``.
        :param timer: optional object with a ``set_msg`` method; when
            given, it receives a 'host/query' description.
        :raises Exception: when the response cannot be decoded (the host is
            also marked failed via ``self.fail()``).

        This is a generator, so none of the above runs until it is iterated.
        """
        # ``timer`` is optional; only annotate it when one was supplied.
        if timer is not None:
            timer.set_msg('host: {host}, query: {query}'.format(
                host=self.host, query=query))

        log.debug("RemoteFinder.find_nodes(host=%s, query=%s) called" %
                  (self.host, query))

        # prevent divide by 0
        cacheTTL = settings.FIND_CACHE_DURATION or 1
        if query.startTime:
            start = query.startTime - (query.startTime % cacheTTL)
        else:
            start = ""

        if query.endTime:
            end = query.endTime - (query.endTime % cacheTTL)
        else:
            end = ""

        cacheKey = "find:%s:%s:%s:%s" % (self.host, compactHash(
            query.pattern), start, end)

        results = cache.get(cacheKey)
        if results is not None:
            log.debug(
                "RemoteFinder.find_nodes(host=%s, query=%s) using cached result"
                % (self.host, query))
        else:
            url = '/metrics/find/'

            query_params = [
                ('local', self.params.get('local', '1')),
                ('format', self.params.get('format', 'pickle')),
                ('query', query.pattern),
            ]
            if query.startTime:
                query_params.append(('from', int(query.startTime)))

            if query.endTime:
                query_params.append(('until', int(query.endTime)))

            result = self.request(url,
                                  fields=query_params,
                                  headers=query.headers,
                                  timeout=settings.REMOTE_FIND_TIMEOUT)

            try:
                if result.getheader('content-type') == 'application/x-msgpack':
                    results = msgpack.load(BufferedHTTPReader(
                        result, buffer_size=settings.REMOTE_BUFFER_SIZE),
                                           encoding='utf-8')
                else:
                    # NOTE(review): this unpickles data received from a
                    # remote host; confirm ``unpickle`` restricts what may
                    # be loaded.
                    results = unpickle.load(
                        BufferedHTTPReader(
                            result, buffer_size=settings.REMOTE_BUFFER_SIZE))
            except Exception as err:
                self.fail()
                log.exception(
                    "RemoteFinder[%s] Error decoding find response from %s: %s"
                    % (self.host, result.url_full, err))
                raise Exception("Error decoding find response from %s: %s" %
                                (result.url_full, err))
            finally:
                # Always hand the connection back, on success or failure.
                result.release_conn()

            cache.set(cacheKey, results, settings.FIND_CACHE_DURATION)

        for node_info in results:
            # handle both 1.x and 0.9.x output
            path = node_info.get('path') or node_info.get('metric_path')
            is_leaf = node_info.get('is_leaf') or node_info.get('isLeaf')
            intervals = node_info.get('intervals') or []
            if not isinstance(intervals, IntervalSet):
                # Plain (start, end) pairs are converted to an IntervalSet.
                intervals = IntervalSet([
                    Interval(interval[0], interval[1])
                    for interval in intervals
                ])

            node_info = {
                'is_leaf': is_leaf,
                'path': path,
                'intervals': intervals,
            }

            if is_leaf:
                reader = RemoteReader(self, node_info)
                node = LeafNode(path, reader)
            else:
                node = BranchNode(path)

            node.local = False
            yield node
Esempio n. 14
0
 def get_intervals(self):
     """Return one interval running from 0 up to the current time."""
     log.info("------->> get_intervals()")
     everything_so_far = Interval(0, time.time())
     return IntervalSet([everything_so_far])
Esempio n. 15
0
    def find_all(self, query, headers=None):
        """Yield the nodes matching ``query``, trying carbon-cache first.

        Steps, in order:

        1. Queue a find job for every available remote store (unless
           ``query.local`` is set).  The jobs are only queued here.
        2. Yield every node returned by ``self.carbon_cache_finder``.  If it
           returned anything and ``query.startTime != 0``, stop there: the
           queued jobs are never run in that case.
        3. Otherwise queue the local finders as well, run all jobs through
           ``self.worker_pool.map`` and group the nodes by path.
        4. Add nodes from ``nodes_with_incomplete_result`` for paths nothing
           else returned.
        5. For each path: yield branch nodes once per path, then reduce the
           leaf nodes to a minimal set covering the query interval and
           yield either the single node or a ``LeafNode`` wrapping a
           ``MultiReader`` over the set.

        This is a generator; ``headers`` is only passed to remote stores.
        """
        start = time.time()
        # NOTE(review): result_queue is not used anywhere in this method.
        result_queue = Queue.Queue()
        jobs = []

        # Start remote searches
        if not query.local:
            # Shuffles the shared remote_stores list in place.
            random.shuffle(self.remote_stores)
            jobs.extend([(store.find, query, headers)
                         for store in self.remote_stores if store.available])

        # single metric query, let's hit carbon-cache first,
        # if we can fetch all data from carbon-cache, then
        # DO NOT hit disk. It helps us reduce iowait.
        # Please use the right version of carbon-cache.
        found_in_cache = False

        # Let's cache nodes with incomplete results, in case we need it and
        # don't have to query carbon-cache again.
        # Presumably filled in by carbon_cache_finder.find_nodes as a
        # name -> node mapping; that is how it is read back below.
        nodes_with_incomplete_result = {}

        for leaf_node in self.carbon_cache_finder.find_nodes(
                query, nodes_with_incomplete_result):
            yield leaf_node
            found_in_cache = True

        # Carbon-cache answered and the query has a non-zero start time:
        # done, without running any of the queued jobs.
        if found_in_cache and query.startTime != 0:
            return

        # Start local searches
        for finder in self.finders:
            jobs.append((finder.find_nodes, query))

        # Group matching nodes by their path
        nodes_by_path = defaultdict(list)

        def _work(job):
            # A job is (callable, arg1, arg2, ...).
            return job[0](*job[1:])

        nodes_list = self.worker_pool.map(_work, jobs)

        for nodes in nodes_list:
            if nodes:
                for node in nodes:
                    nodes_by_path[node.path].append(node)

        # That means we should search all matched nodes.
        # it would merge nodes with new metrics that only exists in carbon-cache

        # Merge in any metric node that exists only in carbon-cache, even
        # though its data there is only partial.
        for name, node in nodes_with_incomplete_result.iteritems():
            if name not in nodes_by_path:
                nodes_by_path[name].append(node)

        log.info("Got all find results in %fs" % (time.time() - start))

        # Search Carbon Cache if nodes_by_path is empty
        #
        # This block covers an edge case:
        # 1) metric: carbon.foo
        # 2) carbon-cache includes 2 hours data for carbon.foo
        # 3) query data starting from 3 hours ago.
        # in such case, previous carbon_cache_finder will not return any node
        # because carbon-cache doesn't have enough data. However, if we reach
        # this point, that means we should return all we have in carbon cache.
        if not nodes_by_path:
            for name, node in nodes_with_incomplete_result.iteritems():
                # it only exists one value
                yield node
            return

        # Reduce matching nodes for each path to a minimal set
        found_branch_nodes = set()

        # Paths are processed in random order.
        items = list(nodes_by_path.iteritems())
        random.shuffle(items)

        for path, nodes in items:
            leaf_nodes = []

            # First we dispense with the BranchNodes
            for node in nodes:
                if node.is_leaf:
                    leaf_nodes.append(node)
                elif node.path not in found_branch_nodes:  #TODO need to filter branch nodes based on requested interval... how?!?!?
                    yield node
                    found_branch_nodes.add(node.path)

            if not leaf_nodes:
                continue

            # Fast-path when there is a single node.
            if len(leaf_nodes) == 1:
                yield leaf_nodes[0]
                continue

            # Calculate best minimal node set
            minimal_node_set = set()
            covered_intervals = IntervalSet([])

            # If the query doesn't fall entirely within the FIND_TOLERANCE window
            # we disregard the window. This prevents unnecessary remote fetches
            # caused when carbon's cache skews node.intervals, giving the appearance
            # remote systems have data we don't have locally, which we probably do.
            now = int(time.time())
            tolerance_window = now - settings.FIND_TOLERANCE
            disregard_tolerance_window = query.interval.start < tolerance_window
            prior_to_window = Interval(float('-inf'), tolerance_window)

            def measure_of_added_coverage(
                    node, drop_window=disregard_tolerance_window):
                # How much of the query interval this node would add on top
                # of covered_intervals.  covered_intervals is read from the
                # enclosing scope at call time, so rebinding it in the loops
                # below is seen here.
                relevant_intervals = node.intervals.intersect_interval(
                    query.interval)
                if drop_window:
                    relevant_intervals = relevant_intervals.intersect_interval(
                        prior_to_window)
                return covered_intervals.union(
                    relevant_intervals).size - covered_intervals.size

            nodes_remaining = list(leaf_nodes)

            # Prefer local nodes first (and do *not* drop the tolerance window)
            for node in leaf_nodes:
                if node.local and measure_of_added_coverage(node, False) > 0:
                    nodes_remaining.remove(node)
                    minimal_node_set.add(node)
                    covered_intervals = covered_intervals.union(node.intervals)

            if settings.REMOTE_STORE_MERGE_RESULTS:
                # Merge mode: take every remaining remote node.
                remote_nodes = [n for n in nodes_remaining if not n.local]
                for node in remote_nodes:
                    nodes_remaining.remove(node)
                    minimal_node_set.add(node)
                    covered_intervals = covered_intervals.union(node.intervals)
            else:
                # Greedy mode: repeatedly take the node adding the most
                # coverage until no node adds any.
                while nodes_remaining:
                    node_coverages = [(measure_of_added_coverage(n), n)
                                      for n in nodes_remaining]
                    # NOTE(review): max() over (coverage, node) tuples
                    # compares the nodes themselves when coverages tie.
                    best_coverage, best_node = max(node_coverages)

                    if best_coverage == 0:
                        break

                    nodes_remaining.remove(best_node)
                    minimal_node_set.add(best_node)
                    covered_intervals = covered_intervals.union(
                        best_node.intervals)

                # Sometimes the requested interval falls within the caching window.
                # We include the most likely node if the gap is within tolerance.
                if not minimal_node_set:

                    def distance_to_requested_interval(node):
                        # Gap between the node's latest interval end and the
                        # query start; infinite when the node has no
                        # intervals or the gap is negative.
                        if not node.intervals:
                            return float('inf')
                        latest = sorted(node.intervals,
                                        key=lambda i: i.end)[-1]
                        distance = query.interval.start - latest.end
                        return distance if distance >= 0 else float('inf')

                    best_candidate = min(leaf_nodes,
                                         key=distance_to_requested_interval)
                    if distance_to_requested_interval(
                            best_candidate) <= settings.FIND_TOLERANCE:
                        minimal_node_set.add(best_candidate)

            # One node is yielded as-is; several are combined behind a
            # MultiReader; an empty set yields nothing for this path.
            if len(minimal_node_set) == 1:
                yield minimal_node_set.pop()
            elif len(minimal_node_set) > 1:
                reader = MultiReader(minimal_node_set)
                yield LeafNode(path, reader)
Esempio n. 16
0
 def get_intervals(self):
   """Return the single interval covered by the file at fs_path.

   The interval starts ``self.get_retention(self.fs_path)`` before the
   current time and ends at the file's mtime, but never before its own
   start.
   """
   now = time.time()
   retention = self.get_retention(self.fs_path)
   oldest = now - retention
   modified = stat(self.fs_path).st_mtime
   newest = max(modified, oldest)
   return IntervalSet([Interval(oldest, newest)])
Esempio n. 17
0
    def find_nodes(self, query, timer=None):
        """Return a list of the nodes on this remote host matching ``query``.

        Results are looked up in the cache first, under a key built from
        the host, a hash of the pattern, and the start/end times rounded
        down to a multiple of ``settings.FIND_CACHE_DURATION``.  On a cache
        miss, ``/metrics/find/`` is requested from the host, the response
        is decoded (msgpack or pickle, by content type) and cached.

        :param query: object providing ``pattern``, ``startTime``,
            ``endTime`` and ``headers``.
        :param timer: optional object with a ``set_msg`` method; when
            given, it receives a 'host/query' description.
        :returns: list of ``LeafNode``/``BranchNode`` objects, each with
            ``local`` set to False.
        :raises Exception: when the response cannot be decoded (the host is
            also marked failed via ``self.fail()``).
        """
        # ``timer`` is optional; only annotate it when one was supplied.
        if timer is not None:
            timer.set_msg('host: {host}, query: {query}'.format(
                host=self.host, query=query))

        log.debug("RemoteFinder.find_nodes(host=%s, query=%s) called" %
                  (self.host, query))

        # prevent divide by 0
        cacheTTL = settings.FIND_CACHE_DURATION or 1
        if query.startTime:
            start = query.startTime - (query.startTime % cacheTTL)
        else:
            start = ""

        if query.endTime:
            end = query.endTime - (query.endTime % cacheTTL)
        else:
            end = ""

        cacheKey = "find:%s:%s:%s:%s" % (self.host, compactHash(
            query.pattern), start, end)

        results = cache.get(cacheKey)
        if results is not None:
            log.debug(
                "RemoteFinder.find_nodes(host=%s, query=%s) using cached result"
                % (self.host, query))
        else:
            url = '/metrics/find/'

            query_params = [
                ('local', self.params.get('local', '1')),
                ('format', self.params.get('format', 'pickle')),
                ('query', query.pattern),
            ]
            if query.startTime:
                query_params.append(('from', int(query.startTime)))

            if query.endTime:
                query_params.append(('until', int(query.endTime)))

            result = self.request(url,
                                  fields=query_params,
                                  headers=query.headers,
                                  timeout=settings.FIND_TIMEOUT)

            try:
                if result.getheader('content-type') == 'application/x-msgpack':
                    results = msgpack.load(BufferedHTTPReader(
                        result, buffer_size=settings.REMOTE_BUFFER_SIZE),
                                           encoding='utf-8')
                else:
                    # NOTE(review): this unpickles data received from a
                    # remote host; confirm ``unpickle`` restricts what may
                    # be loaded.
                    results = unpickle.load(
                        BufferedHTTPReader(
                            result, buffer_size=settings.REMOTE_BUFFER_SIZE))
            except Exception as err:
                self.fail()
                log.exception(
                    "RemoteFinder[%s] Error decoding find response from %s: %s"
                    % (self.host, result.url_full, err))
                raise Exception("Error decoding find response from %s: %s" %
                                (result.url_full, err))
            finally:
                # Always hand the connection back, on success or failure.
                result.release_conn()

            cache.set(cacheKey, results, settings.FIND_CACHE_DURATION)

        # A list is built instead of using a generator because this function
        # may run as a job in a thread pool, where a generator is risky:
        # 1. Generators are lazy: if the job does not iterate the result, the
        #    real (slow, network-bound) work is likely to be triggered in the
        #    calling thread instead, defeating the thread pool.
        # 2. Because execution is delayed, the job manager cannot catch
        #    runtime exceptions from the job as designed.
        nodes = []
        for node_info in results:
            # handle both 1.x and 0.9.x output
            path = node_info.get('path') or node_info.get('metric_path')
            is_leaf = node_info.get('is_leaf') or node_info.get('isLeaf')
            intervals = node_info.get('intervals') or []
            if not isinstance(intervals, IntervalSet):
                # Plain (start, end) pairs are converted to an IntervalSet.
                intervals = IntervalSet([
                    Interval(interval[0], interval[1])
                    for interval in intervals
                ])

            node_info = {
                'is_leaf': is_leaf,
                'path': path,
                'intervals': intervals,
            }

            if is_leaf:
                reader = RemoteReader(self, node_info)
                node = LeafNode(path, reader)
            else:
                node = BranchNode(path)

            node.local = False
            nodes.append(node)

        return nodes
Esempio n. 18
0
    def send(self, headers=None, msg_setter=None):
        """Run the find request against the remote store and yield its nodes.

        Results are taken from the cache under ``self.cacheKey`` when
        present.  Otherwise ``/metrics/find/`` is requested from
        ``self.store.host`` (POST or GET depending on
        ``settings.REMOTE_STORE_USE_POST``), the pickled response is decoded
        and cached.

        If the request fails, the response status is not 200, or the
        response cannot be decoded, the problem is logged,
        ``self.store.fail()`` is called and nothing is yielded.

        :param headers: optional dict of HTTP headers; defaults to ``{}``.
        :param msg_setter: optional callable that receives a 'host/query'
            description string once results are available.

        This is a generator, so none of the above runs until it is iterated.
        """
        log.debug("FindRequest.send(host=%s, query=%s) called" %
                  (self.store.host, self.query))

        if headers is None:
            headers = {}

        results = cache.get(self.cacheKey)
        if results is not None:
            log.debug(
                "FindRequest.send(host=%s, query=%s) using cached result" %
                (self.store.host, self.query))
        else:
            url = "%s://%s/metrics/find/" % (
                'https' if settings.INTRACLUSTER_HTTPS else 'http',
                self.store.host)

            query_params = [
                ('local', '1'),
                ('format', 'pickle'),
                ('query', self.query.pattern),
            ]
            if self.query.startTime:
                query_params.append(('from', self.query.startTime))

            if self.query.endTime:
                query_params.append(('until', self.query.endTime))

            try:
                result = http.request(
                    'POST' if settings.REMOTE_STORE_USE_POST else 'GET',
                    url,
                    fields=query_params,
                    headers=headers,
                    timeout=settings.REMOTE_FIND_TIMEOUT)
            except Exception:
                # ``Exception`` rather than ``BaseException`` so that
                # KeyboardInterrupt/SystemExit are not swallowed here.
                log.exception(
                    "FindRequest.send(host=%s, query=%s) exception during request"
                    % (self.store.host, self.query))
                self.store.fail()
                return

            if result.status != 200:
                log.exception(
                    "FindRequest.send(host=%s, query=%s) error response %d from %s?%s"
                    % (self.store.host, self.query, result.status, url,
                       urlencode(query_params)))
                self.store.fail()
                return

            try:
                # NOTE(review): this unpickles data received from a remote
                # host; confirm ``unpickle`` restricts what may be loaded.
                results = unpickle.loads(result.data)
            except Exception:
                log.exception(
                    "FindRequest.send(host=%s, query=%s) exception processing response"
                    % (self.store.host, self.query))
                self.store.fail()
                return

            cache.set(self.cacheKey, results, settings.FIND_CACHE_DURATION)

        # ``msg_setter`` is optional; only call it when one was supplied.
        if msg_setter is not None:
            msg_setter('host: {host}, query: {query}'.format(
                host=self.store.host, query=self.query))

        for node_info in results:
            # handle both 1.x and 0.9.x output
            path = node_info.get('path') or node_info.get('metric_path')
            is_leaf = node_info.get('is_leaf') or node_info.get('isLeaf')
            intervals = node_info.get('intervals') or []
            if not isinstance(intervals, IntervalSet):
                # Plain (start, end) pairs are converted to an IntervalSet.
                intervals = IntervalSet([
                    Interval(interval[0], interval[1])
                    for interval in intervals
                ])

            node_info = {
                'is_leaf': is_leaf,
                'path': path,
                'intervals': intervals,
            }

            if is_leaf:
                reader = RemoteReader(self.store,
                                      node_info,
                                      bulk_query=[self.query.pattern])
                node = LeafNode(path, reader)
            else:
                node = BranchNode(path)

            node.local = False
            yield node
Esempio n. 19
0
  def find(self, pattern, startTime=None, endTime=None, local=False):
    """Yield the nodes matching ``pattern`` from local and remote stores.

    A ``FindQuery`` is built from the arguments.  Unless ``local`` is true,
    a find is started on every available remote store before the local
    finders are searched, and the remote results are gathered afterwards.

    Nodes are then grouped by path.  For each path, branch nodes are
    yielded once, and leaf nodes are reduced to a minimal set covering the
    query interval: a single node is yielded as-is, several are combined
    into one ``LeafNode`` backed by a ``MultiReader``.

    This is a generator.
    """
    query = FindQuery(pattern, startTime, endTime)

    # Start remote searches
    if not local:
      remote_requests = [ r.find(query) for r in self.remote_stores if r.available ]

    matching_nodes = set()

    # Search locally
    for finder in self.finders:
      for node in finder.find_nodes(query):
        #log.info("find() :: local :: %s" % node)
        matching_nodes.add(node)

    # Gather remote search results
    if not local:
      for request in remote_requests:
        for node in request.get_results():
          #log.info("find() :: remote :: %s from %s" % (node,request.store.host))
          matching_nodes.add(node)

    # Group matching nodes by their path
    nodes_by_path = {}
    for node in matching_nodes:
      if node.path not in nodes_by_path:
        nodes_by_path[node.path] = []

      nodes_by_path[node.path].append(node)

    # Reduce matching nodes for each path to a minimal set
    found_branch_nodes = set()

    for path, nodes in nodes_by_path.iteritems():
      leaf_nodes = []

      # First we dispense with the BranchNodes
      for node in nodes:
        if node.is_leaf:
          leaf_nodes.append(node)
        elif node.path not in found_branch_nodes: #TODO need to filter branch nodes based on requested interval... how?!?!?
          yield node
          found_branch_nodes.add(node.path)

      if not leaf_nodes:
        continue

      # Calculate best minimal node set
      minimal_node_set = set()
      covered_intervals = IntervalSet([])

      # If the query doesn't fall entirely within the FIND_TOLERANCE window
      # we disregard the window. This prevents unnecessary remote fetches
      # caused when carbon's cache skews node.intervals, giving the appearance
      # remote systems have data we don't have locally, which we probably do.
      now = int( time.time() )
      tolerance_window = now - settings.FIND_TOLERANCE
      disregard_tolerance_window = query.interval.start < tolerance_window
      prior_to_window = Interval( float('-inf'), tolerance_window )

      def measure_of_added_coverage(node, drop_window=disregard_tolerance_window):
        # How much of the query interval this node would add on top of
        # covered_intervals (read from the enclosing scope at call time).
        relevant_intervals = node.intervals.intersect_interval(query.interval)
        if drop_window:
          relevant_intervals = relevant_intervals.intersect_interval(prior_to_window)
        return covered_intervals.union(relevant_intervals).size - covered_intervals.size

      nodes_remaining = list(leaf_nodes)

      # Prefer local nodes first (and do *not* drop the tolerance window)
      for node in leaf_nodes:
        if node.local and measure_of_added_coverage(node, False) > 0:
          nodes_remaining.remove(node)
          minimal_node_set.add(node)
          covered_intervals = covered_intervals.union(node.intervals)

      if settings.REMOTE_STORE_MERGE_RESULTS:
        # Merge mode: take every remaining remote node.
        remote_nodes = [n for n in nodes_remaining if not n.local]
        for node in remote_nodes:
          nodes_remaining.remove(node)
          minimal_node_set.add(node)
          covered_intervals = covered_intervals.union(node.intervals)
      else:
        # Greedy mode: repeatedly take the node adding the most coverage
        # until no node adds any.
        while nodes_remaining:
          node_coverages = [ (measure_of_added_coverage(n), n) for n in nodes_remaining ]
          best_coverage, best_node = max(node_coverages)

          if best_coverage == 0:
            break

          nodes_remaining.remove(best_node)
          minimal_node_set.add(best_node)
          covered_intervals = covered_intervals.union(best_node.intervals)

        # Sometimes the requested interval falls within the caching window.
        # We include the most likely node if the gap is within tolerance.
        if not minimal_node_set:
          def distance_to_requested_interval(node):
            # A node without intervals can never be the closest candidate;
            # without this guard sorted(...)[-1] raises IndexError.
            if not node.intervals:
              return float('inf')
            latest = sorted(node.intervals, key=lambda i: i.end)[-1]
            distance = query.interval.start - latest.end
            return distance if distance >= 0 else float('inf')

          best_candidate = min(leaf_nodes, key=distance_to_requested_interval)
          if distance_to_requested_interval(best_candidate) <= settings.FIND_TOLERANCE:
            minimal_node_set.add(best_candidate)

      # One node is yielded as-is; several are combined behind a
      # MultiReader; an empty set yields nothing for this path.
      if len(minimal_node_set) == 1:
        yield minimal_node_set.pop()
      elif len(minimal_node_set) > 1:
        reader = MultiReader(minimal_node_set)
        yield LeafNode(path, reader)
Esempio n. 20
0
    def find_all(self, query, headers=None):
        """Yield the nodes matching ``query`` from local finders and remote stores.

        One job is created per local finder (``finder.find_nodes``) and,
        unless ``query.local`` is set, per available remote store
        (``store.find``). Job results are collected until every job has
        reported or ``settings.REMOTE_FIND_TIMEOUT`` seconds have elapsed
        since the call started. The nodes are then grouped by path:

        * branch nodes are yielded once per path;
        * a path with a single leaf node yields that node unchanged;
        * a path with several leaf nodes is reduced to a minimal set of
          nodes covering the requested interval; one surviving node is
          yielded as-is, several are wrapped in a ``LeafNode`` backed by a
          ``MultiReader``.

        :param query: find query; ``query.local`` and ``query.interval``
            are read here.
        :param headers: passed through to each remote store's ``find``.
        :returns: generator of nodes.
        """
        start = time.time()
        result_queue = Queue.Queue()
        jobs = []

        # Start remote searches
        if not query.local:
            # Shuffled in place so remote stores are not always queried in
            # the same order.
            random.shuffle(self.remote_stores)
            jobs.extend([(store.find, query, headers)
                         for store in self.remote_stores if store.available])

        # Start local searches
        for finder in self.finders:
            jobs.append((finder.find_nodes, query))

        if settings.USE_WORKER_POOL:
            return_result = lambda x: result_queue.put(x)
            for job in jobs:
                get_pool().apply_async(func=job[0],
                                       args=job[1:],
                                       callback=return_result)
        else:
            # No pool: run every job inline and queue its result.
            for job in jobs:
                result_queue.put(job[0](*job[1:]))

        # Group matching nodes by their path
        nodes_by_path = defaultdict(list)

        deadline = start + settings.REMOTE_FIND_TIMEOUT
        result_cnt = 0

        while result_cnt < len(jobs):
            wait_time = deadline - time.time()

            try:
                nodes = result_queue.get(True, wait_time)

            # ValueError could happen if due to really unlucky timing wait_time is negative
            except (Queue.Empty, ValueError):
                if time.time() > deadline:
                    log.info("Timed out in find_all after %fs" %
                             (settings.REMOTE_FIND_TIMEOUT))
                    break
                else:
                    continue

            log.info("Got a find result after %fs" % (time.time() - start))
            result_cnt += 1
            if nodes:
                for node in nodes:
                    nodes_by_path[node.path].append(node)

        log.info("Got all find results in %fs" % (time.time() - start))

        # Reduce matching nodes for each path to a minimal set
        found_branch_nodes = set()

        items = list(nodes_by_path.iteritems())
        random.shuffle(items)

        for path, nodes in items:
            leaf_nodes = []

            # First we dispense with the BranchNodes
            for node in nodes:
                if node.is_leaf:
                    leaf_nodes.append(node)
                elif node.path not in found_branch_nodes:  #TODO need to filter branch nodes based on requested interval... how?!?!?
                    yield node
                    found_branch_nodes.add(node.path)

            if not leaf_nodes:
                continue

            # Fast-path when there is a single node.
            if len(leaf_nodes) == 1:
                yield leaf_nodes[0]
                continue

            # Calculate best minimal node set
            minimal_node_set = set()
            covered_intervals = IntervalSet([])

            # If the query doesn't fall entirely within the FIND_TOLERANCE window
            # we disregard the window. This prevents unnecessary remote fetches
            # caused when carbon's cache skews node.intervals, giving the appearance
            # remote systems have data we don't have locally, which we probably do.
            now = int(time.time())
            tolerance_window = now - settings.FIND_TOLERANCE
            disregard_tolerance_window = query.interval.start < tolerance_window
            prior_to_window = Interval(float('-inf'), tolerance_window)

            def measure_of_added_coverage(
                    node, drop_window=disregard_tolerance_window):
                # Size of the part of the query interval that ``node`` would
                # add on top of what ``covered_intervals`` already holds.
                relevant_intervals = node.intervals.intersect_interval(
                    query.interval)
                if drop_window:
                    relevant_intervals = relevant_intervals.intersect_interval(
                        prior_to_window)
                return covered_intervals.union(
                    relevant_intervals).size - covered_intervals.size

            nodes_remaining = list(leaf_nodes)

            # Prefer local nodes first (and do *not* drop the tolerance window)
            for node in leaf_nodes:
                if node.local and measure_of_added_coverage(node, False) > 0:
                    nodes_remaining.remove(node)
                    minimal_node_set.add(node)
                    covered_intervals = covered_intervals.union(node.intervals)

            if settings.REMOTE_STORE_MERGE_RESULTS:
                # Merging is enabled: keep every remaining remote node.
                remote_nodes = [n for n in nodes_remaining if not n.local]
                for node in remote_nodes:
                    nodes_remaining.remove(node)
                    minimal_node_set.add(node)
                    covered_intervals = covered_intervals.union(node.intervals)
            else:
                # Greedy selection: repeatedly take the node that adds the
                # most coverage until nothing adds any.
                while nodes_remaining:
                    node_coverages = [(measure_of_added_coverage(n), n)
                                      for n in nodes_remaining]
                    # Rank on coverage only. Comparing the full tuples would
                    # fall back to comparing the node objects themselves when
                    # two coverages tie, which is arbitrary at best and a
                    # TypeError for unorderable objects.
                    best_coverage, best_node = max(
                        node_coverages, key=lambda item: item[0])

                    if best_coverage == 0:
                        break

                    nodes_remaining.remove(best_node)
                    minimal_node_set.add(best_node)
                    covered_intervals = covered_intervals.union(
                        best_node.intervals)

                # Sometimes the requested interval falls within the caching window.
                # We include the most likely node if the gap is within tolerance.
                if not minimal_node_set:

                    def distance_to_requested_interval(node):
                        # Gap between the node's latest interval end and the
                        # query start; infinite when the node has no
                        # intervals or its data ends after the query start.
                        if not node.intervals:
                            return float('inf')
                        latest = sorted(node.intervals,
                                        key=lambda i: i.end)[-1]
                        distance = query.interval.start - latest.end
                        return distance if distance >= 0 else float('inf')

                    best_candidate = min(leaf_nodes,
                                         key=distance_to_requested_interval)
                    if distance_to_requested_interval(
                            best_candidate) <= settings.FIND_TOLERANCE:
                        minimal_node_set.add(best_candidate)

            if len(minimal_node_set) == 1:
                yield minimal_node_set.pop()
            elif len(minimal_node_set) > 1:
                reader = MultiReader(minimal_node_set)
                yield LeafNode(path, reader)
Esempio n. 21
0
 def get_intervals(self):
     """Return an IntervalSet with one interval spanning 0 to the current time."""
     # "All time": no narrower bound is computed for this reader.
     now = int(time.time())
     whole_span = Interval(0, now)
     return IntervalSet([whole_span])
Esempio n. 22
0
    def _merge_leaf_nodes(self, query, path, leaf_nodes):
        """Get a single node from a list of leaf nodes."""
        if not leaf_nodes:
            return None

        # Fast-path when there is a single node.
        if len(leaf_nodes) == 1:
            return leaf_nodes[0]

        # Calculate best minimal node set
        minimal_node_set = set()
        covered_intervals = IntervalSet([])

        # If the query doesn't fall entirely within the FIND_TOLERANCE window
        # we disregard the window. This prevents unnecessary remote fetches
        # caused when carbon's cache skews node.intervals, giving the appearance
        # remote systems have data we don't have locally, which we probably
        # do.
        now = int(time.time())
        tolerance_window = now - settings.FIND_TOLERANCE
        disregard_tolerance_window = query.interval.start < tolerance_window
        prior_to_window = Interval(float('-inf'), tolerance_window)

        def measure_of_added_coverage(node,
                                      drop_window=disregard_tolerance_window):
            relevant_intervals = node.intervals.intersect_interval(
                query.interval)
            if drop_window:
                relevant_intervals = relevant_intervals.intersect_interval(
                    prior_to_window)
            return covered_intervals.union(
                relevant_intervals).size - covered_intervals.size

        nodes_remaining = list(leaf_nodes)

        # Prefer local nodes first (and do *not* drop the tolerance window)
        for node in leaf_nodes:
            if node.local and measure_of_added_coverage(node, False) > 0:
                nodes_remaining.remove(node)
                minimal_node_set.add(node)
                covered_intervals = covered_intervals.union(node.intervals)

        if settings.REMOTE_STORE_MERGE_RESULTS:
            remote_nodes = [n for n in nodes_remaining if not n.local]
            for node in remote_nodes:
                nodes_remaining.remove(node)
                minimal_node_set.add(node)
                covered_intervals = covered_intervals.union(node.intervals)
        else:
            while nodes_remaining:
                node_coverages = [(measure_of_added_coverage(n), n)
                                  for n in nodes_remaining]
                best_coverage, best_node = max(node_coverages)

                if best_coverage == 0:
                    break

                nodes_remaining.remove(best_node)
                minimal_node_set.add(best_node)
                covered_intervals = covered_intervals.union(
                    best_node.intervals)

            # Sometimes the requested interval falls within the caching window.
            # We include the most likely node if the gap is within
            # tolerance.
            if not minimal_node_set:

                def distance_to_requested_interval(node):
                    if not node.intervals:
                        return float('inf')
                    latest = sorted(node.intervals, key=lambda i: i.end)[-1]
                    distance = query.interval.start - latest.end
                    return distance if distance >= 0 else float('inf')

                best_candidate = min(leaf_nodes,
                                     key=distance_to_requested_interval)
                if distance_to_requested_interval(
                        best_candidate) <= settings.FIND_TOLERANCE:
                    minimal_node_set.add(best_candidate)

        if not minimal_node_set:
            return None
        elif len(minimal_node_set) == 1:
            return minimal_node_set.pop()
        else:
            reader = MultiReader(minimal_node_set)
            return LeafNode(path, reader)
Esempio n. 23
0
    def get_intervals(self):
        """Return an IntervalSet with one interval from 0 to the current time."""
        # TODO use borinud summary to build Interval(start, end) instead of
        # reporting everything up to now.
        upper_bound = int(time.time())
        return IntervalSet([Interval(0, upper_bound)])