Example #1
  def delete(self, start_id, end_id):
    shards = list(self.get_overlapping_shards(uuid_to_kronos_time(start_id),
                                              uuid_to_kronos_time(end_id)))

    def delete_from_shard(shard):
      batch_stmt = BatchStatement(batch_type=BatchType.UNLOGGED,
                                  consistency_level=ConsistencyLevel.QUORUM)
      num_deleted = 0
      shard = StreamShard(self.namespace, self.stream,
                          shard['start_time'], shard['width'],
                          shard['shard'], False,
                          MAX_LIMIT, read_size=self.read_size)
      for _id in shard.ids_iterator(start_id, end_id):
        # `start_id` itself is excluded from the delete range.
        if _id == start_id:
          continue
        num_deleted += 1
        batch_stmt.add(BoundStatement(self.namespace.DELETE_STMT,
                                      routing_key=shard.key,
                                      consistency_level=ConsistencyLevel.QUORUM)
                       .bind((shard.key, _id)))
      self.session.execute(batch_stmt)
      return num_deleted

    # Fan the per-shard deletes out to greenlets and wait for them all.
    for i, shard in enumerate(shards):
      shards[i] = execute_greenlet_async(delete_from_shard, shard)
    wait(shards)

    errors = []
    num_deleted = 0
    for shard in shards:
      try:
        num_deleted += shard.get()
      except Exception as e:
        errors.append(repr(e))
    return num_deleted, errors
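`execute_greenlet_async` and `wait` are kronos-internal helpers that are not
shown here. Assuming they are thin wrappers over gevent (an assumption, not
the confirmed implementation), a minimal sketch would be:

import gevent

def execute_greenlet_async(func, *args, **kwargs):
  # Hypothetical stand-in: run `func` on its own greenlet. Calling
  # .get() on the result later returns the return value or re-raises
  # whatever exception the greenlet died with.
  return gevent.spawn(func, *args, **kwargs)

def wait(greenlets):
  # Block until every greenlet has finished, successfully or not.
  gevent.joinall(greenlets)

That re-raising `.get()` behavior is what lets the `try/except` around
`shard.get()` above collect per-shard errors after `wait` returns.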
Example #2
    def retrieve(self,
                 namespace,
                 stream,
                 start_time,
                 end_time,
                 start_id,
                 configuration,
                 order=ResultOrder.ASCENDING,
                 limit=sys.maxint):
        """
    Retrieves all the events for `stream` from `start_time` (inclusive) till
    `end_time` (inclusive). Alternatively to `start_time`, `start_id` can be
    provided, and then all events from `start_id` (exclusive) till `end_time`
    (inclusive) are returned. `start_id` should be used in cases when the client
    got disconnected from the server before all the events in the requested
    time window had been returned. `order` can be one of ResultOrder.ASCENDING
    or ResultOrder.DESCENDING.

    Returns an iterator over all JSON serialized (strings) events.
    """
        if not start_id:
            start_id = uuid_from_kronos_time(start_time, _type=UUIDType.LOWEST)
        else:
            start_id = TimeUUID(start_id)
        if uuid_to_kronos_time(start_id) > end_time:
            return []
        return self._retrieve(namespace, stream, start_id, end_time, order,
                              limit, configuration)
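`UUIDType.LOWEST` exists so that the inclusive `start_time` bound composes
with an exclusive id comparison: it yields a v1-style UUID whose timestamp
bits encode `start_time` while every other bit is minimal, so it sorts below
any real event id at that timestamp. A rough sketch of the idea, assuming
kronos time is already in 100ns units (the actual kronos implementation may
differ):

import uuid

# 100ns intervals between the UUID v1 epoch (1582-10-15) and the
# Unix epoch (1970-01-01).
UUID_V1_EPOCH_OFFSET = 0x01b21dd213814000

def lowest_uuid_for_time(kronos_time):
  # Hypothetical sketch: place the time in the v1 timestamp fields and
  # zero out clock_seq/node so the id compares below any real event id
  # carrying the same timestamp.
  ts = kronos_time + UUID_V1_EPOCH_OFFSET
  return uuid.UUID(fields=(ts & 0xffffffff,                 # time_low
                           (ts >> 32) & 0xffff,             # time_mid
                           ((ts >> 48) & 0x0fff) | 0x1000,  # time_hi + v1
                           0x80,                            # variant bits
                           0x00, 0x00))                     # clock_seq_low, node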
Example #3
 def _delete(self, namespace, stream, start_id, end_time, configuration):
     """
 Delete events with id > `start_id` and end_time <= `end_time`.
 """
     start_time = uuid_to_kronos_time(start_id)
     body_query = {
         'query': {
             'filtered': {
                 'query': {
                     'match_all': {}
                 },
                 'filter': {
                     'bool': {
                         'should': [{
                             'range': {
                                 TIMESTAMP_FIELD: {
                                     'gt': start_time,
                                     'lte': end_time
                                 }
                             }
                         }, {
                             'bool': {
                                 'must': [{
                                     'range': {
                                         ID_FIELD: {
                                             'gt': str(start_id)
                                         }
                                     }
                                 }, {
                                     'term': {
                                         TIMESTAMP_FIELD: start_time
                                     }
                                 }]
                             }
                         }]
                     }
                 }
             }
         }
     }
     query = {
         'index': self.index_manager.get_index(namespace),
         'doc_type': stream,
         'body': body_query,
         'ignore': 404,
         'allow_no_indices': True,
         'ignore_unavailable': True
     }
     try:
         # XXX: ElasticSearch does not return stats on deletions.
         # https://github.com/elasticsearch/elasticsearch/issues/6519
         count = self.es.count(**query).get('count', 0)
         if count:
             self.es.delete_by_query(**query)
         return count, []
     except Exception as e:
         return 0, [repr(e)]
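Read as a plain boolean, the `should`/`bool` filter above deletes an event
iff it lies strictly inside the time window, or it shares `start_id`'s
timestamp but has a larger id. An illustrative predicate, not part of the
backend:

def matches_delete_filter(event_time, event_id,
                          start_time, start_id, end_time):
  # First `should` branch: strictly after start_time, up to end_time.
  in_window = start_time < event_time <= end_time
  # Second branch: same timestamp as start_id but a strictly larger id.
  tie_break = (event_time == start_time) and (event_id > str(start_id))
  return in_window or tie_break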
Example #4
        def actions():
            for _id, event in events:
                dt = kronos_time_to_datetime(uuid_to_kronos_time(_id))
                start_dts_to_add.add(_round_datetime_down(dt))
                event['_index'] = index
                event['_type'] = stream
                event[LOGSTASH_TIMESTAMP_FIELD] = dt.isoformat()

                yield event
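The `actions()` generator shapes events for the elasticsearch-py bulk
helper: `_index` and `_type` are routing metadata that the helper consumes
rather than indexing. A plausible call site inside the enclosing method,
assuming `self.es` is an `Elasticsearch` client:

from elasticsearch import helpers

# Stream the generator straight into the bulk indexer; with
# stats_only=True it returns (succeeded_count, failed_count).
succeeded, failed = helpers.bulk(self.es, actions(), stats_only=True)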
Example #5
 def delete(self, namespace, stream, start_time, end_time, start_id,
            configuration):
   if not start_id:
     start_id = uuid_from_kronos_time(start_time, _type=UUIDType.LOWEST)
   else:
     start_id = TimeUUID(start_id)
   if uuid_to_kronos_time(start_id) > end_time:
     return 0, []  # match _delete's (count, errors) return shape
   return self._delete(namespace, stream, start_id, end_time, configuration)
Example #6
 def delete(self, namespace, stream, start_time, end_time, start_id,
            configuration):
     if not start_id:
         start_id = uuid_from_kronos_time(start_time - 1,
                                          _type=UUIDType.HIGHEST)
     else:
         start_id = TimeUUID(start_id)
     if uuid_to_kronos_time(start_id) > end_time:
         return 0, []  # match _delete's (count, errors) return shape
     return self._delete(namespace, stream, start_id, end_time,
                         configuration)
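The two delete variants above (Examples #5 and #6) differ only in the
sentinel they build when `start_id` is missing: one takes the LOWEST-type
UUID at `start_time`, the other the HIGHEST-type UUID at `start_time - 1`.
Both sort below every real event id at `start_time`, so `_delete`'s
exclusive `id > start_id` comparison still covers events at `start_time`
itself.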
Example #7
 def _delete(self, namespace, stream, start_id, end_time, configuration):
   """
    Delete events with id > `start_id` and timestamp <= `end_time`.
   """
   start_time = uuid_to_kronos_time(start_id)
   body_query = {
     'query': {
       'filtered': {
         'query': {'match_all': {}},
         'filter': {
           'bool': {
             'should': [
               {
                 'range': {TIMESTAMP_FIELD: {'gt': start_time,
                                             'lte': end_time}}
               },
               {
                 'bool': {
                   'must': [
                     {'range': {ID_FIELD: {'gt': str(start_id)}}},
                     {'term': {TIMESTAMP_FIELD: start_time}}
                   ]
                 }
               }
             ]
           }
         }
       }
     }
   }
   query = {'index': self.index_manager.get_index(namespace),
            'doc_type': stream,
            'body': body_query,
            'ignore': 404,
            'allow_no_indices': True,
            'ignore_unavailable': True}
   try:
     # XXX: ElasticSearch does not return stats on deletions.
     # https://github.com/elasticsearch/elasticsearch/issues/6519
     count = self.es.count(**query).get('count', 0)
     if count:
       self.es.delete_by_query(**query)
     return count, []
    except Exception as e:
      return 0, [repr(e)]
Example #8
  def retrieve(self, namespace, stream, start_time, end_time, start_id,
               configuration, order=ResultOrder.ASCENDING, limit=sys.maxint):
    """
    Retrieves all the events for `stream` from `start_time` (inclusive) till
    `end_time` (inclusive). As an alternative to `start_time`, `start_id` can
    be provided, in which case all events from `start_id` (exclusive) till
    `end_time` (inclusive) are returned. `start_id` should be used when the
    client was disconnected from the server before all the events in the
    requested time window had been returned. `order` can be one of
    ResultOrder.ASCENDING or ResultOrder.DESCENDING.

    Returns an iterator over all JSON serialized (string) events.
    """
    if not start_id:
      start_id = uuid_from_kronos_time(start_time, _type=UUIDType.LOWEST)
    else:
      start_id = TimeUUID(start_id)
    if uuid_to_kronos_time(start_id) > end_time:
      return []
    return self._retrieve(namespace, stream, start_id, end_time, order, limit,
                          configuration)
Example #9
    def _retrieve(self, namespace, stream, start_id, end_time, order, limit,
                  configuration):
        """
    Yield events from stream starting after the event with id `start_id` until
    and including events with timestamp `end_time`.
    """
        indices = self.index_manager.get_aliases(namespace,
                                                 uuid_to_kronos_time(start_id),
                                                 end_time)
        if not indices:
            return

        end_id = uuid_from_kronos_time(end_time, _type=UUIDType.HIGHEST)
        end_id.descending = start_id.descending = descending = (
            order == ResultOrder.DESCENDING)

        start_time = uuid_to_kronos_time(start_id)
        body_query = {
            'query': {
                'filtered': {
                    'query': {
                        'match_all': {}
                    },
                    'filter': {
                        'range': {
                            TIMESTAMP_FIELD: {
                                'gte': start_time,
                                'lte': end_time
                            }
                        }
                    }
                }
            }
        }
        order = 'desc' if descending else 'asc'
        sort_query = [
            '%s:%s' % (TIMESTAMP_FIELD, order),
            '%s:%s' % (ID_FIELD, order)
        ]

        last_id = end_id if descending else start_id
        scroll_id = None
        while True:
            size = max(
                min(limit, configuration['read_size']) / self.shards, 10)
            if scroll_id is None:
                res = self.es.search(index=indices,
                                     doc_type=stream,
                                     size=size,
                                     body=body_query,
                                     sort=sort_query,
                                     _source=True,
                                     scroll='1m',
                                     ignore=[400, 404],
                                     allow_no_indices=True,
                                     ignore_unavailable=True)
            else:
                res = self.es.scroll(scroll_id, scroll='1m')
            if '_scroll_id' not in res:
                break
            scroll_id = res['_scroll_id']
            hits = res.get('hits', {}).get('hits')
            if not hits:
                break

            for hit in hits:
                _id = TimeUUID(hit['_id'], descending=descending)
                if _id <= last_id:
                    continue
                last_id = _id
                event = hit['_source']
                del event[LOGSTASH_TIMESTAMP_FIELD]
                yield json.dumps(event)
                limit -= 1
                if limit == 0:
                    break
            if limit <= 0:
                # The inner `break` only exits the hits loop; stop
                # scrolling entirely once the limit is exhausted.
                break

        if scroll_id is not None:
            self.es.clear_scroll(scroll_id)
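Because `retrieve` hands back an iterator of JSON strings, callers decode
lazily and never buffer more than one scroll page. Hypothetical usage; the
backend, stream, and configuration values are placeholders:

import json

config = {'read_size': 1000}
for raw in backend.retrieve('demo-namespace', 'clicks',
                            start_time, end_time, None, config):
  event = json.loads(raw)
  # ... process one event at a time ...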
Example #10
  def iterator(self, start_id, end_id, descending, limit):
    start_id.descending = end_id.descending = descending

    shards = self.get_overlapping_shards(uuid_to_kronos_time(start_id),
                                         uuid_to_kronos_time(end_id))
    shards = sorted(map(lambda shard: StreamShard(self.namespace,
                                                  self.stream,
                                                  shard['start_time'],
                                                  shard['width'],
                                                  shard['shard'],
                                                  descending,
                                                  limit,
                                                  self.read_size),
                        shards))
    iterators = {}
    event_heap = []
    shards_to_load = []

    def load_next_shards(cmp_id):
      """
      Pulls the earliest event from the next earliest shard and puts it into the
      event heap.
      """
      while shards and shards[0].cmp_id <= cmp_id:
        shard = shards.pop(0)
        shard.start_fetching_events_async(start_id, end_id)
        shards_to_load.append(shard)
      while shards_to_load:
        shard = shards_to_load.pop(0)
        it = shard.iterator(start_id, end_id)
        try:
          event = next(it)
          heapq.heappush(event_heap, event)
          iterators[shard] = it
        except StopIteration:
          pass

    def load_overlapping_shards():
      """
      Given what the current most recently loaded event is, loads any
      shards that might overlap with that event. Multiple shards
      might overlap because they have overlapping time slices.
      """
      while not event_heap and shards:
        # Try to pull events from unread shards.
        load_next_shards(shards[0].cmp_id)

      if event_heap and shards:
        # Pull events from all shards that overlap with the next event to be
        # yielded.
        load_next_shards(event_heap[0].id)
      elif not iterators:
        # No events in the heap and no active iterators? We're done!
        return

      shards_with_events = set(event.stream_shard for event in event_heap)
      for shard in list(iterators):  # copy keys: entries may be deleted below
        if shard in shards_with_events:
          continue
        try:
          it = iterators[shard]
          event = next(it)
          heapq.heappush(event_heap, event)
        except StopIteration:
          del iterators[shard]

    def _iterator(limit):
      load_overlapping_shards()  # bootstrap.

      # No events?
      if not event_heap:
        return

      while event_heap or shards:
        if limit <= 0:
          return
        if event_heap:
          # Get the next event to return.
          event = heapq.heappop(event_heap)
          # Note: in the descending conditions below, we flip `<` for `>` and
          # `>=` for `<=` because the UUID comparator logic is flipped.
          if ((not descending and event.id > end_id) or
              (descending and event.id > start_id)):
            return
          elif ((not descending and event.id >= start_id) or
                (descending and event.id >= end_id)):
            limit -= 1
            yield event

        load_overlapping_shards()

    for event in _iterator(limit):
      yield event
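Stripped of the lazy shard loading and limit bookkeeping, the iterator above
is a k-way merge: every shard yields its events in sorted order and a heap
repeatedly picks the global minimum. The core is what `heapq.merge` does:

import heapq

def merged_events(shard_iterators):
  # Simplified analogy: each input iterator is already sorted, so
  # merging them produces one globally sorted event stream.
  for event in heapq.merge(*shard_iterators):
    yield event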
Example #11
  def _retrieve(self, namespace, stream, start_id, end_time, order, limit,
                configuration):
    """
    Yield events from stream starting after the event with id `start_id` until
    and including events with timestamp `end_time`.
    """
    indices = self.index_manager.get_aliases(namespace,
                                             uuid_to_kronos_time(start_id),
                                             end_time)
    if not indices:
      return

    end_id = uuid_from_kronos_time(end_time, _type=UUIDType.HIGHEST)
    end_id.descending = start_id.descending = descending = (
      order == ResultOrder.DESCENDING)

    start_time = uuid_to_kronos_time(start_id)
    body_query = {
      'query': {
        'filtered': {
          'query': {'match_all': {}},
          'filter': {
            'range': {TIMESTAMP_FIELD: {'gte': start_time, 'lte': end_time}}
          }
        }
      }
    }
    order = 'desc' if descending else 'asc'
    sort_query = [
      '%s:%s' % (TIMESTAMP_FIELD, order),
      '%s:%s' % (ID_FIELD, order)
    ]

    last_id = end_id if descending else start_id
    scroll_id = None
    while True:
      size = max(min(limit, configuration['read_size']) / self.shards, 10)
      if scroll_id is None:
        res = self.es.search(index=indices,
                             doc_type=stream,
                             size=size,
                             body=body_query,
                             sort=sort_query,
                             _source=True,
                             scroll='1m',
                             ignore=[400, 404],
                             allow_no_indices=True,
                             ignore_unavailable=True)
      else:
        res = self.es.scroll(scroll_id, scroll='1m')
      if '_scroll_id' not in res:
        break
      scroll_id = res['_scroll_id']
      hits = res.get('hits', {}).get('hits')
      if not hits:
        break

      for hit in hits:
        _id = TimeUUID(hit['_id'], descending=descending)
        if _id <= last_id:
          continue
        last_id = _id
        event = hit['_source']
        yield json.dumps(event)
        limit -= 1
        if limit == 0:
          break
      if limit <= 0:
        # The inner `break` only exits the hits loop; stop scrolling
        # entirely once the limit is exhausted.
        break

    if scroll_id is not None:
      self.es.clear_scroll(scroll_id)
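The `descending` flag set on `TimeUUID` in the examples above flips the id's
comparison order, which is why the same `_id <= last_id` dedup test works in
both directions. A minimal sketch of that idea; the real kronos TimeUUID
also parses and compares the embedded v1 timestamp:

import functools

@functools.total_ordering
class ReversibleId(object):
  # Hypothetical stand-in for TimeUUID's order flipping.
  def __init__(self, key, descending=False):
    self.key = key
    self.descending = descending

  def __eq__(self, other):
    return self.key == other.key

  def __lt__(self, other):
    if self.descending:
      return self.key > other.key
    return self.key < other.key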