Example 1
  def __init__(self, cluster, name, replication_factor, read_size):
    self.cluster = cluster
    self.name = name
    self.replication_factor = replication_factor
    self.read_size = read_size
    self.session = None

    # Create session.
    self.create_session()

    # Cache for Stream instances.
    self.stream_cache = InMemoryLRUCache(max_items=1000)
Example 2
  def __init__(self, namespace, stream, width, shards, read_size):
    self.session = namespace.session
    self.read_size = read_size
    self.stream = stream
    self.shards = shards
    self.width = width
    self.namespace = namespace

    # Index cache is a write cache: it prevents us from writing to the
    # bucket index if we've already updated it in a previous
    # operation.
    self.index_cache = InMemoryLRUCache(max_items=1000)
Example 3
    def __init__(self, storage):
        self.es = storage.es
        self.index_prefix = storage.index_prefix
        self.rollover_size = storage.rollover_size
        self.rollover_check_period_seconds = storage.rollover_check_period_seconds
        self.namespaces = storage.namespaces
        self.namespace_to_metadata = {}

        # Alias cache is a write cache. It prevents us creating an alias for an
        # index if we've already created it in a previous operation.
        self.alias_cache = defaultdict(
            lambda: InMemoryLRUCache(max_items=1000))

        self.update()

        self.rollover_worker = gevent.spawn(self.update_periodic)
        atexit.register(self.kill_rollover_worker)
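
The alias cache in Example 3 is only initialized here; the code that consults it isn't shown. A minimal sketch of the write-cache pattern its comment describes, using a hypothetical helper and assuming `es` is an elasticsearch-py client (the exact put_alias arguments are an assumption):

def create_alias_if_needed(es, alias_cache, namespace, index, alias):
    # Hypothetical helper, not part of the original code. alias_cache mirrors
    # Example 3: a defaultdict mapping namespace -> InMemoryLRUCache.
    try:
        alias_cache[namespace].get(index)
        return  # Alias was already created in a previous operation.
    except KeyError:
        pass
    # elasticsearch-py call; the exact arguments here are an assumption.
    es.indices.put_alias(index=index, name=alias)
    alias_cache[namespace].set(index, None)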
Example 4
class Namespace(object):
  # Namespace-level CQL statements.
  CREATE_KEYSPACE_CQL = """CREATE KEYSPACE %s WITH
    REPLICATION = {'class': 'SimpleStrategy',
                   'replication_factor': %d}"""
  DROP_KEYSPACE_CQL = """DROP KEYSPACE %s"""
  # key is of the form: "<stream_name>:<start_time>:<time_width>"
  STREAM_CQL = """CREATE TABLE stream (
    key text,
    id timeuuid,
    blob text,
    PRIMARY KEY (key, id)
  )"""
  INDEX_CQL = """CREATE TABLE idx (
    stream text,
    start_time bigint,
    width bigint,
    shard int,
    PRIMARY KEY (stream, start_time, width, shard)
  )"""
  STREAM_LIST_CQL = """SELECT DISTINCT stream FROM idx"""

  # Stream-level CQL statements.
  DELETE_STMT = """DELETE FROM stream WHERE
    key = ? AND
    id = ?"""
  INSERT_STMT = """INSERT INTO stream (key, id, blob)
    VALUES (?, ?, ?)"""
  INDEX_INSERT_STMT = """INSERT INTO idx (stream, start_time, width, shard)
    VALUES (?, ?, ?, ?)"""
  INDEX_SCAN_STMT = """SELECT start_time, width, shard FROM idx WHERE
    stream = ? AND
    start_time >= ? AND
    start_time < ?"""

  # StreamShard-level CQL statements.
  SELECT_ASC_STMT = """SELECT id, blob FROM stream WHERE
    key = ? AND
    id >= ? AND
    id <= ?
    ORDER BY id ASC
    LIMIT ?
    """
  SELECT_DESC_STMT = """SELECT id, blob FROM stream WHERE
    key = ? AND
    id >= ? AND
    id <= ?
    ORDER BY id DESC
    LIMIT ?
    """
  SELECT_ID_STMT = """SELECT id FROM stream WHERE
    key = ? AND
    id >= ? AND
    id <= ?
    LIMIT ?
    """

  def __init__(self, cluster, name, replication_factor, read_size):
    self.cluster = cluster
    self.name = name
    self.replication_factor = replication_factor
    self.read_size = read_size
    self.session = None

    # Create session.
    self.create_session()

    # Cache for Stream instances.
    self.stream_cache = InMemoryLRUCache(max_items=1000)

  def get_stream(self, stream_name, width, shards):
    # Changing the width or shard settings requires a restart of kronosd, so
    # we can just cache on the stream name.
    try:
      return self.stream_cache.get(stream_name)
    except KeyError:
      stream = Stream(self, stream_name, width, shards, self.read_size)
      self.stream_cache.set(stream_name, stream)
      return stream

  def list_streams(self):
    for stream in self.session.execute(
      SimpleStatement(Namespace.STREAM_LIST_CQL,
                      consistency_level=ConsistencyLevel.QUORUM)):
      yield stream[0]

  def create_session(self):
    if self.session:
      raise CassandraStorageError

    try:
      session = self.cluster.connect(self.name)
    except cassandra.InvalidRequest as e:
      if "Keyspace '%s' does not exist" % self.name not in e.message:
        raise e
      session = self.cluster.connect()

      # Create keyspace for namespace.
      session.execute(Namespace.CREATE_KEYSPACE_CQL %
                      (self.name, self.replication_factor))
      session.set_keyspace(self.name)

      # Create column families + indices.
      session.execute(Namespace.STREAM_CQL)
      session.execute(Namespace.INDEX_CQL)

    self.session = session

    # Prepare statements for this session.
    for attr, value in Namespace.__dict__.iteritems():
      if not (attr.upper() == attr and attr.endswith('_STMT')):
        continue
      setattr(self, attr, self.session.prepare(value))
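
The examples here all rely on the same small contract from InMemoryLRUCache: get(key) raises KeyError on a miss, set(key, value) stores a value, and max_items bounds the cache size. The class itself isn't shown in these snippets; a minimal stand-in honoring that contract (an assumption, not the original implementation) might be:

from collections import OrderedDict


class InMemoryLRUCache(object):
  # Minimal sketch: evicts the least recently used entry once max_items is
  # exceeded. The real kronos implementation may differ in detail.
  def __init__(self, max_items=1000):
    self.max_items = max_items
    self._items = OrderedDict()

  def get(self, key):
    # Raises KeyError on a miss, which get_stream() and the write caches
    # above depend on.
    value = self._items.pop(key)
    self._items[key] = value  # Re-insert to mark as most recently used.
    return value

  def set(self, key, value):
    if key in self._items:
      del self._items[key]
    self._items[key] = value
    if len(self._items) > self.max_items:
      self._items.popitem(last=False)  # Drop the least recently used entry.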
Example 5
class Stream(object):
  # Maximum shard width: 6 months (in 100ns kronos time units).
  MAX_WIDTH = int(timedelta(days=365.25).total_seconds() * 1e7) / 2

  def __init__(self, namespace, stream, width, shards, read_size):
    self.session = namespace.session
    self.read_size = read_size
    self.stream = stream
    self.shards = shards
    self.width = width
    self.namespace = namespace

    # Index cache is a write cache: it prevents us from writing to the
    # bucket index if we've already updated it in a previous
    # operation.
    self.index_cache = InMemoryLRUCache(max_items=1000)

  def get_overlapping_shards(self, start_time, end_time):
    index_scan_stmt = BoundStatement(self.namespace.INDEX_SCAN_STMT)
    potential_shards = self.session.execute(
      index_scan_stmt.bind((self.stream,
                            max(start_time - Stream.MAX_WIDTH, 0),
                            end_time))
    )
    shards = defaultdict(lambda: defaultdict(int))
    for (shard_time, width, shard) in potential_shards:
      if shard_time + width < start_time:
        # Shard ends before the query's start_time; skip it. (Shards starting
        # after end_time are already excluded by the index scan.)
        continue
      shards[shard_time][shard] = max(shards[shard_time][shard], width)
    for shard_time, widths_by_shard in shards.iteritems():
      for shard, width in widths_by_shard.iteritems():
        yield {'start_time': shard_time,
               'width': width,
               'shard': shard}

  def insert(self, events):
    if not events:
      return
    batch_stmt = BatchStatement(batch_type=BatchType.UNLOGGED,
                                consistency_level=ConsistencyLevel.QUORUM)

    shard_idx = {}
    for _id, event in events:
      shard_time = round_down(event[TIMESTAMP_FIELD], self.width)
      shard = shard_idx.get(shard_time,
                            random.randint(0, self.shards - 1))

      # Insert to index.
      try:
        self.index_cache.get((shard_time, shard))
      except KeyError:
        batch_stmt.add(BoundStatement(self.namespace.INDEX_INSERT_STMT,
                                      routing_key=self.stream,
                                      consistency_level=ConsistencyLevel.QUORUM)
                       .bind((self.stream, shard_time, self.width, shard)))
        self.index_cache.set((shard_time, shard), None)

      # Insert to stream.
      shard_key = StreamShard.get_key(self.stream, shard_time, shard)
      batch_stmt.add(BoundStatement(self.namespace.INSERT_STMT,
                                    routing_key=shard_key,
                                    consistency_level=ConsistencyLevel.QUORUM)
                     .bind((shard_key,
                            _id,
                            marshal.dumps(event))))
      shard_idx[shard_time] = (shard + 1) % self.shards  # Round robin.

    self.session.execute(batch_stmt)

  def iterator(self, start_id, end_id, descending, limit):
    start_id.descending = end_id.descending = descending

    shards = self.get_overlapping_shards(uuid_to_kronos_time(start_id),
                                         uuid_to_kronos_time(end_id))
    shards = sorted(map(lambda shard: StreamShard(self.namespace,
                                                  self.stream,
                                                  shard['start_time'],
                                                  shard['width'],
                                                  shard['shard'],
                                                  descending,
                                                  limit,
                                                  self.read_size),
                        shards))
    iterators = {}
    event_heap = []
    shards_to_load = []

    def load_next_shards(cmp_id):
      """
      Pulls the earliest event from the next earliest shard and puts it into the
      event heap.
      """
      while shards and shards[0].cmp_id <= cmp_id:
        shard = shards.pop(0)
        shard.start_fetching_events_async(start_id, end_id)
        shards_to_load.append(shard)
      while shards_to_load:
        shard = shards_to_load.pop(0)
        it = shard.iterator(start_id, end_id)
        try:
          event = it.next()
          heapq.heappush(event_heap, event)
          iterators[shard] = it
        except StopIteration:
          pass

    def load_overlapping_shards():
      """
      Given what the current most recently loaded event is, loads any
      shards that might overlap with that event. Multiple shards
      might overlap because they have overlapping time slices.
      """
      while not event_heap and shards:
        # Try to pull events from unread shards.
        load_next_shards(shards[0].cmp_id)

      if event_heap and shards:
        # Pull events from all shards that overlap with the next event to be
        # yielded.
        load_next_shards(event_heap[0].id)
      elif not iterators:
        # No events in the heap and no active iterators? We're done!
        return

      shards_with_events = set(event.stream_shard for event in event_heap)
      for shard in iterators.keys():
        if shard in shards_with_events:
          continue
        try:
          it = iterators[shard]
          event = it.next()
          heapq.heappush(event_heap, event)
        except StopIteration:
          del iterators[shard]

    def _iterator(limit):
      load_overlapping_shards()  # bootstrap.

      # No events?
      if not event_heap:
        raise StopIteration

      while event_heap or shards:
        if limit <= 0:
          raise StopIteration
        if event_heap:
          # Get the next event to return.
          event = heapq.heappop(event_heap)
          # Note: in the descending conditions below, we flip `<` for `>` and
          # `>=` for `<=` because the UUID comparator logic is flipped when
          # iterating in descending order.
          if ((not descending and event.id > end_id) or
              (descending and event.id > start_id)):
            raise StopIteration
          elif ((not descending and event.id >= start_id) or
                (descending and event.id >= end_id)):
            limit -= 1
            yield event

        load_overlapping_shards()

    for event in _iterator(limit):
      yield event

  def delete(self, start_id, end_id):
    shards = list(self.get_overlapping_shards(uuid_to_kronos_time(start_id),
                                              uuid_to_kronos_time(end_id)))

    def delete_from_shard(shard):
      batch_stmt = BatchStatement(batch_type=BatchType.UNLOGGED,
                                  consistency_level=ConsistencyLevel.QUORUM)
      num_deleted = 0
      shard = StreamShard(self.namespace, self.stream,
                          shard['start_time'], shard['width'],
                          shard['shard'], False,
                          MAX_LIMIT, read_size=self.read_size)
      for _id in shard.ids_iterator(start_id, end_id):
        if _id == start_id:
          continue
        num_deleted += 1
        batch_stmt.add(BoundStatement(self.namespace.DELETE_STMT,
                                      routing_key=shard.key,
                                      consistency_level=ConsistencyLevel.QUORUM)
                       .bind((shard.key, _id)))
      self.session.execute(batch_stmt)
      return num_deleted

    for i, shard in enumerate(shards):
      shards[i] = execute_greenlet_async(delete_from_shard, shard)
    wait(shards)

    errors = []
    num_deleted = 0
    for shard in shards:
      try:
        num_deleted += shard.get()
      except Exception as e:
        errors.append(repr(e))

    return num_deleted, errors
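
Example 5 buckets each event with round_down(event[TIMESTAMP_FIELD], self.width) before picking a shard. That helper isn't shown; assuming it simply floors a timestamp to the nearest multiple of the shard width, a sketch would be:

def round_down(value, base):
  # Assumed behaviour, e.g. round_down(1734, 500) == 1500: events whose
  # timestamps fall in the same width-sized window share a shard_time bucket,
  # which is what the idx rows written in insert() record.
  return value - (value % base)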