# Example no. 3
class HbaseTSDB(TSDB):
    """Time-series store keeping whisper-style round-robin archives in HBase.

    Two tables are used, each with the single column family ``cf``:

    * ``<table_prefix>META`` holds
        - the row ``CTR`` whose ``cf:CTR`` counter hands out archive ids,
        - one row ``m_<metric>`` per metric whose ``cf:INFO`` column is the
          JSON-serialized info dict (see ``info``),
        - the row ``ROOT`` and the branch rows ``m_<path>`` whose
          ``cf:c_<part>`` columns link to the row key of each child node.
    * ``<table_prefix>DATA`` holds one row per (archiveId, slot), keyed by
      ``struct.pack(KEY_FMT, archiveId, slot)``; its ``cf:d`` column is
      ``struct.pack(VAL_FMT, timestamp, value)``.
    """

    __slots__ = ("transport", "client", "metaTable", "dataTable")

    def __init__(self, host, port, table_prefix):
        """Connect to the HBase Thrift server and ensure both tables exist.

        host, port   -- address of the Thrift server
        table_prefix -- string prepended to "META" and "DATA" to form the
                        two table names
        """
        self.metaTable = table_prefix + "META"
        self.dataTable = table_prefix + "DATA"

        # set up client
        sock = TSocket.TSocket(host, port)
        self.transport = TTransport.TBufferedTransport(sock)
        protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = Client(protocol)
        # A Thrift transport has to be opened before the first RPC is issued.
        self.transport.open()

        # ensure both our tables exist
        tables = self.client.getTableNames()
        if self.metaTable not in tables:
            self.client.createTable(self.metaTable, [ColumnDescriptor("cf:")])
            # add counter record
            self.client.atomicIncrement(self.metaTable, "CTR", "cf:CTR", 1)
        if self.dataTable not in tables:
            self.client.createTable(self.dataTable, [ColumnDescriptor("cf:")])

    def info(self, metric):
        """Return the info dict stored for ``metric``.

        The dict is read from the ``cf:INFO`` column of the meta row
        ``m_<metric>`` and, as written by ``create``, has the form

            info = {
                'aggregationMethod': ...,
                'maxRetention': largest secondsPerPoint * points,
                'xFilesFactor': ...,
                'archives': [archiveInfo, ...],
            }

        where each archiveInfo is

            archiveInfo = {
                'archiveId': unique id,
                'secondsPerPoint': secondsPerPoint,
                'points': number of points in the archive,
                'retention': secondsPerPoint * points,
            }

        Raises Exception if no info is stored for the metric.
        """
        result = self.client.get(self.metaTable, "m_" + metric, "cf:INFO", None)
        if not result:
            raise Exception("No metric " + metric)
        return json.loads(result[0].value)

    def setAggregationMethod(self, metric, aggregationMethod, xFilesFactor=None):
        """Change the aggregation settings stored for ``metric``.

        aggregationMethod -- method used when propagating data
                             (see ``whisper.aggregationMethods``)
        xFilesFactor      -- fraction of data points in a propagation interval
                             that must have known values for a propagation to
                             occur.  If None, the stored xFilesFactor is left
                             unchanged.

        Raises Exception (from ``info``) if the metric does not exist.
        """
        currInfo = self.info(metric)
        currInfo["aggregationMethod"] = aggregationMethod
        if xFilesFactor is not None:
            currInfo["xFilesFactor"] = xFilesFactor

        infoJson = json.dumps(currInfo)
        self.client.mutateRow(self.metaTable, "m_" + metric, [Mutation(column="cf:INFO", value=infoJson)], None)

    def create(self, metric, archiveList, xFilesFactor, aggregationMethod, isSparse, doFallocate):
        """Register ``metric`` and link it into the metric tree.

        archiveList       -- list of archives, each of the form
                             (secondsPerPoint, numberOfPoints)
        xFilesFactor      -- fraction of data points in a propagation interval
                             that must have known values for a propagation to
                             occur
        aggregationMethod -- function used when propagating data
                             (see ``whisper.aggregationMethods``)
        isSparse, doFallocate -- accepted but not used by this backend
        """
        # Every archive gets its own id from the shared counter row.
        archiveMapList = [
            {
                "archiveId": self.client.atomicIncrement(self.metaTable, "CTR", "cf:CTR", 1),
                "secondsPerPoint": secondsPerPoint,
                "points": points,
                "retention": secondsPerPoint * points,
            }
            for secondsPerPoint, points in archiveList
        ]

        oldest = max(secondsPerPoint * points for secondsPerPoint, points in archiveList)
        # then write the metanode
        info = {
            "aggregationMethod": aggregationMethod,
            "maxRetention": oldest,
            "xFilesFactor": xFilesFactor,
            "archives": archiveMapList,
        }
        self.client.mutateRow(self.metaTable, "m_" + metric, [Mutation(column="cf:INFO", value=json.dumps(info))], None)

        # finally, ensure links exist
        priorParts = ""
        for part in metric.split("."):
            if priorParts == "":
                # top-level components hang off the special ROOT row
                metricParentKey = "ROOT"
                priorParts = part
            else:
                metricParentKey = "m_" + priorParts
                priorParts += "." + part
            metricKey = "m_" + priorParts

            # make sure parent of this node exists and is linked to us
            parentLink = self.client.get(self.metaTable, metricParentKey, "cf:c_" + part, None)
            if not parentLink:
                self.client.mutateRow(
                    self.metaTable, metricParentKey, [Mutation(column="cf:c_" + part, value=metricKey)], None
                )

    def update_many(self, metric, points):
        """Write ``points`` into the archives of ``metric``.

        points -- list of (timestamp, value) points.

        Each point goes to the first archive whose retention covers its age;
        points older than every archive are dropped.
        NOTE(review): the archive iterator only moves forward, so this assumes
        points are ordered newest first and info['archives'] is ordered by
        increasing retention -- confirm against the callers.
        """
        info = self.info(metric)
        now = int(time.time())
        archives = iter(info["archives"])
        currentArchive = next(archives, None)
        currentPoints = []

        for point in points:
            age = now - point[0]

            # we can't fit any more points in this archive
            while currentArchive is not None and currentArchive["retention"] < age:
                if currentPoints:  # commit all the points we've found that it can fit
                    currentPoints.reverse()  # put points in chronological order
                    self.__archive_update_many(info, currentArchive, currentPoints)
                    currentPoints = []
                currentArchive = next(archives, None)

            if currentArchive is None:
                break  # drop remaining points that don't fit in the database

            currentPoints.append(point)

        # don't forget to commit after we've checked all the archives
        if currentArchive is not None and currentPoints:
            # same ordering as the in-loop commit, so that duplicates within
            # one slot are resolved the same way in both places
            currentPoints.reverse()
            self.__archive_update_many(info, currentArchive, currentPoints)

    def __archive_update_many(self, info, archive, points):
        """Store ``points`` in ``archive`` and propagate to coarser archives.

        info    -- info dict of the metric
        archive -- archiveInfo dict of the archive receiving the points
        points  -- list of (timestamp, value); when several points align to
                   the same step, the last one in the list wins
        """
        numPoints = archive["points"]
        step = archive["secondsPerPoint"]
        archiveId = archive["archiveId"]
        # Align to the archive's step; dict() keeps the last value of duplicates.
        valueByTimestamp = dict((timestamp - (timestamp % step), value) for (timestamp, value) in points)
        alignedPoints = list(valueByTimestamp.items())

        for timestamp, value in alignedPoints:
            slot = int((timestamp // step) % numPoints)
            rowkey = struct.pack(KEY_FMT, archiveId, slot)
            rowval = struct.pack(VAL_FMT, timestamp, value)
            self.client.mutateRow(self.dataTable, rowkey, [Mutation(column="cf:d", value=rowval)], None)

        # Now we propagate the updates to lower-precision archives.
        # NOTE(review): relies on info['archives'] being ordered from finest
        # to coarsest so that each pass aggregates from the previous one.
        higher = archive
        lowerArchives = [arc for arc in info["archives"] if arc["secondsPerPoint"] > step]

        for lower in lowerArchives:
            lowerStep = lower["secondsPerPoint"]
            uniqueLowerIntervals = set(timestamp - (timestamp % lowerStep) for timestamp, _ in alignedPoints)
            propagateFurther = False
            for interval in uniqueLowerIntervals:
                if self.__propagate(info, interval, higher, lower):
                    propagateFurther = True

            if not propagateFurther:
                break  # nothing was written at this level, so nothing new below it either
            higher = lower

    def __propagate(self, info, timestamp, higher, lower):
        """Aggregate one interval of ``higher`` into a single point of ``lower``.

        info      -- info dict of the metric
        timestamp -- a time inside the interval of ``lower`` to recompute
        higher    -- archiveInfo dict of the finer archive to read from
        lower     -- archiveInfo dict of the coarser archive to write to

        Returns True if a point was written, False if too few known values
        were available.
        """
        aggregationMethod = info["aggregationMethod"]
        xff = info["xFilesFactor"]
        if xff is None:
            # Under Python 2 every number compared greater than None, i.e. a
            # missing factor meant "always propagate"; keep that meaning.
            xff = 0.0

        # we want to update the items from higher between these two
        lowerStep = lower["secondsPerPoint"]
        higherStep = higher["secondsPerPoint"]
        intervalStart = timestamp - (timestamp % lowerStep)
        intervalEnd = intervalStart + lowerStep

        # __archive_fetch moves both bounds one step forward after aligning
        # them; shift the request back by one step so the returned window is
        # [intervalStart, intervalEnd) when the bounds are step-aligned.
        _, higherResData = self.__archive_fetch(higher, intervalStart - higherStep, intervalEnd - higherStep)

        known_datapts = [v for v in higherResData if v is not None]  # strip out "nones"
        if not known_datapts:
            return False
        if float(len(known_datapts)) / len(higherResData) < xff:
            return False

        # we have enough data, so propagate downwards
        aggregateValue = util.aggregate(aggregationMethod, known_datapts)
        lowerSlot = int((intervalStart // lowerStep) % lower["points"])
        rowkey = struct.pack(KEY_FMT, lower["archiveId"], lowerSlot)
        rowval = struct.pack(VAL_FMT, intervalStart, aggregateValue)
        self.client.mutateRow(self.dataTable, rowkey, [Mutation(column="cf:d", value=rowval)], None)
        return True

    def __archive_fetch(self, archive, startTime, endTime):
        """Read the values of ``archive`` between two times.

        archive            -- archiveInfo dict of the archive to read
        startTime, endTime -- epoch times; each is aligned down to the
                              archive's step and then moved one step forward

        Returns (timeInfo, values) where timeInfo is the adjusted
        (startTime, endTime, step) and values has one entry per step in
        [startTime, endTime), None where no point is stored.  values is empty
        when the adjusted window contains no step.
        """
        step = archive["secondsPerPoint"]
        numPoints = archive["points"]
        archiveId = archive["archiveId"]
        startTime = int(startTime - (startTime % step) + step)
        endTime = int(endTime - (endTime % step) + step)
        timeInfo = (startTime, endTime, step)

        numSlots = int((endTime - startTime) // step)
        if numSlots <= 0:
            return timeInfo, []
        # One result list shared by all scanned ranges.
        ret = [None] * numSlots

        startSlot = int((startTime // step) % numPoints)
        endSlot = int((endTime // step) % numPoints)
        if numSlots >= numPoints:
            # the window covers the whole ring, so every slot is a candidate
            ranges = [(0, numPoints)]
        elif startSlot > endSlot:  # we wrapped so make 2 queries
            ranges = [(0, endSlot + 1), (startSlot, numPoints)]
        else:
            ranges = [(startSlot, endSlot + 1)]

        for firstSlot, stopSlot in ranges:
            startkey = struct.pack(KEY_FMT, archiveId, firstSlot)
            endkey = struct.pack(KEY_FMT, archiveId, stopSlot)
            scannerId = self.client.scannerOpenWithStop(self.dataTable, startkey, endkey, ["cf:d"], None)
            try:
                for row in self.client.scannerGetList(scannerId, 100000):
                    (timestamp, value) = struct.unpack(VAL_FMT, row.columns["cf:d"].value)
                    # A slot may still hold a point from an earlier lap of the
                    # ring; only timestamps inside the window are used.
                    if startTime <= timestamp < endTime:
                        ret[int((timestamp - startTime) // step)] = value
            finally:
                # release the server-side scanner even if reading fails
                self.client.scannerClose(scannerId)

        return timeInfo, ret

    def exists(self, metric):
        """Return True if the meta table has a row for ``metric``."""
        return len(self.client.getRow(self.metaTable, "m_" + metric, None)) > 0

    def fetch(self, info, fromTime, untilTime):
        """Fetch the stored values of a metric for a time range.

        info      -- info dict of the metric (see ``info``)
        fromTime  -- epoch time
        untilTime -- epoch time, or None for now; capped at now

        Returns a tuple (timeInfo, valueList) where timeInfo is itself a
        tuple (fromTime, untilTime, step).
        Returns None if no data can be returned: the range lies entirely in
        the future or entirely before the retention, or there are no archives.
        Raises Exception if fromTime is after untilTime.
        """
        now = int(time.time())
        if untilTime is None:
            untilTime = now
        fromTime = int(fromTime)
        untilTime = int(untilTime)
        # Validate the caller's interval before clamping it, so that a range
        # lying in the future yields None rather than this error.
        if fromTime > untilTime:
            raise Exception("Invalid time interval: from time '%s' is after until time '%s'" % (fromTime, untilTime))

        oldestTime = now - info["maxRetention"]
        if fromTime > now:  # from time in the future
            return None
        if untilTime < oldestTime:  # range ends before the oldest retained point
            return None
        if untilTime > now:
            untilTime = now
        if fromTime < oldestTime:
            fromTime = oldestTime

        # iterate archives to find the first one whose retention covers the
        # range; if none does, the last archive is used
        diff = now - fromTime
        archive = None
        for candidate in info["archives"]:
            archive = candidate
            if candidate["retention"] >= diff:
                break
        if archive is None:
            return None
        return self.__archive_fetch(archive, fromTime, untilTime)

    def get_intervals(self, metric):
        """Return [start, end] as unix times: from now minus maxRetention to now."""
        maxRetention = self.info(metric)["maxRetention"]
        end = time.time()
        return [end - maxRetention, end]

    def find_nodes(self, query):
        """Return a generator of the nodes matching ``query.pattern``.

        Backslashes are removed from the pattern, which is then split on "."
        and matched component by component starting at the ROOT row.  The
        generator yields LeafNode and BranchNode objects.
        """
        # break query into parts
        clean_pattern = query.pattern.replace("\\", "")
        pattern_parts = clean_pattern.split(".")
        return self._find_paths("ROOT", pattern_parts)

    def _find_paths(self, currNodeRowKey, patterns):
        """Recursively generate the nodes below a row that match ``patterns``.

        currNodeRowKey -- meta-table row key of the node to search under
        patterns       -- remaining pattern components; the first one is
                          matched against the children of the current node

        Yields a LeafNode for every match that has a ``cf:INFO`` column and a
        BranchNode for every other match.
        """
        from graphite.node import BranchNode, LeafNode
        from graphite.intervals import Interval, IntervalSet

        pattern = patterns[0]
        patterns = patterns[1:]

        nodeRow = self.client.getRow(self.metaTable, currNodeRowKey, None)
        if not nodeRow:
            return

        subnodes = {}
        for column, cell in nodeRow[0].columns.items():
            if column.startswith("cf:c_"):  # branches start with c_
                # pop off cf:c_ prefix; split once only, because the
                # component itself may contain underscores
                subnodes[column.split("_", 1)[1]] = cell.value

        matching_subnodes = match_entries(list(subnodes), pattern)

        if patterns:  # we've still got more directories to traverse
            for subnode in matching_subnodes:
                rowKey = subnodes[subnode]
                subNodeContents = self.client.getRow(self.metaTable, rowKey, None)
                if not subNodeContents:
                    continue  # link to a row that does not exist

                # leafs have a cf:INFO column describing their data
                # we can't possibly match on a leaf here because we have more components in the pattern,
                # so only recurse on branches
                if "cf:INFO" not in subNodeContents[0].columns:
                    for m in self._find_paths(rowKey, patterns):
                        yield m

        else:  # at the end of the pattern
            for subnode in matching_subnodes:
                rowKey = subnodes[subnode]
                nodeRow = self.client.getRow(self.metaTable, rowKey, None)
                if not nodeRow:
                    continue
                # pop off "m_" in key; split once only, because metric names
                # may contain underscores
                metric = rowKey.split("_", 1)[1]
                if "cf:INFO" in nodeRow[0].columns:
                    info = json.loads(nodeRow[0].columns["cf:INFO"].value)
                    end = time.time()
                    start = end - info["maxRetention"]
                    intervals = IntervalSet([Interval(start, end)])
                    reader = HbaseReader(metric, intervals, info, self)
                    yield LeafNode(metric, reader)
                else:
                    yield BranchNode(metric)
