def delete(self, start_id, end_id):
  shards = list(self.get_overlapping_shards(uuid_to_kronos_time(start_id),
                                            uuid_to_kronos_time(end_id)))

  def delete_from_shard(shard):
    batch_stmt = BatchStatement(batch_type=BatchType.UNLOGGED,
                                consistency_level=ConsistencyLevel.QUORUM)
    num_deleted = 0
    shard = StreamShard(self.namespace, self.stream,
                        shard['start_time'], shard['width'], shard['shard'],
                        False, MAX_LIMIT, read_size=self.read_size)
    for _id in shard.ids_iterator(start_id, end_id):
      if _id == start_id:
        # The event with `start_id` itself is excluded from deletion.
        continue
      num_deleted += 1
      batch_stmt.add(BoundStatement(self.namespace.DELETE_STMT,
                                    routing_key=shard.key,
                                    consistency_level=ConsistencyLevel.QUORUM)
                     .bind((shard.key, _id)))
    self.session.execute(batch_stmt)
    return num_deleted

  # Fan the per-shard deletes out to greenlets, then join on all of them.
  for i, shard in enumerate(shards):
    shards[i] = execute_greenlet_async(delete_from_shard, shard)
  wait(shards)

  errors = []
  num_deleted = 0
  for shard in shards:
    try:
      num_deleted += shard.get()
    except Exception as e:
      errors.append(repr(e))
  return num_deleted, errors
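`execute_greenlet_async` and `wait` are helpers from the surrounding codebase. A minimal sketch of the fan-out/join pattern they appear to implement, assuming a gevent-based event loop (the helper names are the real ones; the bodies are a guess):

import gevent

def execute_greenlet_async(fn, *args, **kwargs):
  # Spawn `fn` in a greenlet; the caller later calls .get() on the result,
  # which returns the function's return value or re-raises its exception.
  return gevent.spawn(fn, *args, **kwargs)

def wait(greenlets):
  # Block until every greenlet has finished, successfully or not.
  gevent.joinall(greenlets)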
def retrieve(self, namespace, stream, start_time, end_time, start_id,
             configuration, order=ResultOrder.ASCENDING, limit=sys.maxint):
  """
  Retrieves all events for `stream` from `start_time` (inclusive) to
  `end_time` (inclusive). Instead of `start_time`, a `start_id` may be
  provided, in which case all events after `start_id` (exclusive) up to
  `end_time` (inclusive) are returned. `start_id` is useful when a client
  was disconnected from the server before all events in the requested time
  window had been returned. `order` must be ResultOrder.ASCENDING or
  ResultOrder.DESCENDING.

  Returns an iterator over events, each serialized as a JSON string.
  """
  if not start_id:
    start_id = uuid_from_kronos_time(start_time, _type=UUIDType.LOWEST)
  else:
    start_id = TimeUUID(start_id)
  if uuid_to_kronos_time(start_id) > end_time:
    return []
  return self._retrieve(namespace, stream, start_id, end_time, order,
                        limit, configuration)
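A hypothetical usage sketch: `backend` is an illustrative instance of this storage class, the stream names and configuration dict are made up, and kronos timestamps are assumed to be 100ns ticks since the epoch (as the uuid_*_kronos_time helpers imply):

import time

now = int(time.time() * 1e7)  # assumed: kronos time in 100ns units
one_hour = 3600 * 10 ** 7
events = backend.retrieve('analytics', 'clicks',
                          start_time=now - one_hour, end_time=now,
                          start_id=None,
                          configuration={'read_size': 1000})
for event in events:
  print event  # each event is a JSON-serialized string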
def _delete(self, namespace, stream, start_id, end_time, configuration):
  """
  Delete events with id > `start_id` and timestamp <= `end_time`.
  """
  start_time = uuid_to_kronos_time(start_id)
  # Match events strictly inside (start_time, end_time], plus events that
  # share `start_time` but whose id sorts after `start_id`.
  body_query = {
    'query': {
      'filtered': {
        'query': {'match_all': {}},
        'filter': {
          'bool': {
            'should': [
              {'range': {TIMESTAMP_FIELD: {'gt': start_time,
                                           'lte': end_time}}},
              {'bool': {
                'must': [
                  {'range': {ID_FIELD: {'gt': str(start_id)}}},
                  {'term': {TIMESTAMP_FIELD: start_time}}
                ]
              }}
            ]
          }
        }
      }
    }
  }
  query = {'index': self.index_manager.get_index(namespace),
           'doc_type': stream,
           'body': body_query,
           'ignore': 404,
           'allow_no_indices': True,
           'ignore_unavailable': True}
  try:
    # XXX: ElasticSearch does not return stats on deletions, so count the
    # matching documents first and report that number.
    # https://github.com/elasticsearch/elasticsearch/issues/6519
    count = self.es.count(**query).get('count', 0)
    if count:
      self.es.delete_by_query(**query)
    return count, []
  except Exception as e:
    return 0, [repr(e)]
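The bool/should filter above encodes a compound (timestamp, id) cursor. In plain Python, the predicate it matches is roughly:

def _matches(event_time, event_id, start_time, start_id, end_time):
  # Either strictly newer than the cursor's timestamp (and within the
  # window), or at the cursor's exact timestamp with an id that sorts
  # after the cursor's id (ids compare lexicographically as strings).
  return ((start_time < event_time <= end_time) or
          (event_time == start_time and event_id > str(start_id)))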
def actions():
  for _id, event in events:
    dt = kronos_time_to_datetime(uuid_to_kronos_time(_id))
    # Track the rounded start datetimes of the indices this insert touches.
    start_dts_to_add.add(_round_datetime_down(dt))
    # Inline bulk metadata so each document routes to the right index/type.
    event['_index'] = index
    event['_type'] = stream
    event[LOGSTASH_TIMESTAMP_FIELD] = dt.isoformat()
    yield event
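Because each yielded document carries its own `_index` and `_type`, the generator looks designed for elasticsearch-py's bulk helpers. A plausible consumer (the `es` client construction here is an assumption, not shown in the source):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()
# `bulk` drains the generator and, with raise_on_error disabled, returns
# a (success_count, error_list) tuple.
success_count, bulk_errors = bulk(es, actions(), raise_on_error=False)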
def delete(self, namespace, stream, start_time, end_time, start_id,
           configuration):
  if not start_id:
    # Use the lowest possible UUID at `start_time`: `_delete` is exclusive
    # on id, so every real event at `start_time` remains eligible.
    start_id = uuid_from_kronos_time(start_time, _type=UUIDType.LOWEST)
  else:
    start_id = TimeUUID(start_id)
  if uuid_to_kronos_time(start_id) > end_time:
    return 0
  return self._delete(namespace, stream, start_id, end_time, configuration)
def delete(self, namespace, stream, start_time, end_time, start_id,
           configuration):
  if not start_id:
    # The highest possible UUID at `start_time - 1` sorts just below every
    # real event at `start_time`, so deletion is inclusive of `start_time`.
    start_id = uuid_from_kronos_time(start_time - 1, _type=UUIDType.HIGHEST)
  else:
    start_id = TimeUUID(start_id)
  if uuid_to_kronos_time(start_id) > end_time:
    return 0
  return self._delete(namespace, stream, start_id, end_time, configuration)
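The two `delete` wrappers above differ only in how they make `start_time` inclusive: since `_delete` excludes `start_id` itself, one passes the LOWEST UUID at `start_time` while the other passes the HIGHEST UUID at `start_time - 1`. A quick check of the invariant both rely on, assuming (as the comparisons elsewhere in this code imply) that these helpers return ordered TimeUUIDs:

t = 13000000000000000  # an arbitrary kronos timestamp

below = uuid_from_kronos_time(t - 1, _type=UUIDType.HIGHEST)
floor = uuid_from_kronos_time(t, _type=UUIDType.LOWEST)

# Both sentinels sort before every real event id at timestamp `t`, so the
# `id > start_id` condition in _delete keeps events at `start_time` eligible.
assert below < floor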
def _retrieve(self, namespace, stream, start_id, end_time, order, limit,
              configuration):
  """
  Yield events from `stream`, starting after the event with id `start_id`,
  up to and including events with timestamp `end_time`.
  """
  indices = self.index_manager.get_aliases(namespace,
                                           uuid_to_kronos_time(start_id),
                                           end_time)
  if not indices:
    return
  end_id = uuid_from_kronos_time(end_time, _type=UUIDType.HIGHEST)
  end_id.descending = start_id.descending = descending = (
    order == ResultOrder.DESCENDING)
  start_time = uuid_to_kronos_time(start_id)
  body_query = {
    'query': {
      'filtered': {
        'query': {'match_all': {}},
        'filter': {
          'range': {TIMESTAMP_FIELD: {'gte': start_time, 'lte': end_time}}
        }
      }
    }
  }
  order = 'desc' if descending else 'asc'
  sort_query = ['%s:%s' % (TIMESTAMP_FIELD, order),
                '%s:%s' % (ID_FIELD, order)]
  # `last_id` is the dedup cursor; the TimeUUID comparator is flipped when
  # `descending`, so `<=` below works for both sort orders.
  last_id = end_id if descending else start_id
  scroll_id = None
  while True:
    size = max(min(limit, configuration['read_size']) / self.shards, 10)
    if scroll_id is None:
      res = self.es.search(index=indices,
                           doc_type=stream,
                           size=size,
                           body=body_query,
                           sort=sort_query,
                           _source=True,
                           scroll='1m',
                           ignore=[400, 404],
                           allow_no_indices=True,
                           ignore_unavailable=True)
    else:
      res = self.es.scroll(scroll_id, scroll='1m')
    if '_scroll_id' not in res:
      break
    scroll_id = res['_scroll_id']
    hits = res.get('hits', {}).get('hits')
    if not hits:
      break
    for hit in hits:
      _id = TimeUUID(hit['_id'], descending=descending)
      if _id <= last_id:
        # Already yielded, or before the requested window; skip it.
        continue
      last_id = _id
      event = hit['_source']
      # Strip the internal Logstash timestamp field before returning.
      del event[LOGSTASH_TIMESTAMP_FIELD]
      yield json.dumps(event)
      limit -= 1
      if limit == 0:
        break
    if limit == 0:
      # The inner `break` only exits the hits loop; stop scrolling too.
      break
  if scroll_id is not None:
    self.es.clear_scroll(scroll_id)
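Stripped of the cursor and limit bookkeeping, the paging loop above is the standard elasticsearch-py scan/scroll pattern. A minimal self-contained version of just that pattern:

from elasticsearch import Elasticsearch

def scroll_hits(es, index, body, size=100):
  # Open a scroll context, drain it page by page, then release it.
  res = es.search(index=index, body=body, size=size, scroll='1m')
  scroll_id = res.get('_scroll_id')
  try:
    while True:
      hits = res.get('hits', {}).get('hits')
      if not hits:
        break
      for hit in hits:
        yield hit
      res = es.scroll(scroll_id, scroll='1m')
      scroll_id = res.get('_scroll_id', scroll_id)
  finally:
    if scroll_id:
      es.clear_scroll(scroll_id)

# usage: for hit in scroll_hits(Elasticsearch(), 'events-*',
#                               {'query': {'match_all': {}}}): ...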
def iterator(self, start_id, end_id, descending, limit):
  start_id.descending = end_id.descending = descending
  shards = self.get_overlapping_shards(uuid_to_kronos_time(start_id),
                                       uuid_to_kronos_time(end_id))
  shards = sorted(map(lambda shard: StreamShard(self.namespace, self.stream,
                                                shard['start_time'],
                                                shard['width'],
                                                shard['shard'],
                                                descending, limit,
                                                self.read_size),
                      shards))
  iterators = {}
  event_heap = []
  shards_to_load = []

  def load_next_shards(cmp_id):
    """
    Pulls the earliest event from the next earliest shard and puts it into
    the event heap.
    """
    while shards and shards[0].cmp_id <= cmp_id:
      shard = shards.pop(0)
      shard.start_fetching_events_async(start_id, end_id)
      shards_to_load.append(shard)
    while shards_to_load:
      shard = shards_to_load.pop(0)
      it = shard.iterator(start_id, end_id)
      try:
        event = it.next()
        heapq.heappush(event_heap, event)
        iterators[shard] = it
      except StopIteration:
        pass

  def load_overlapping_shards():
    """
    Given the most recently loaded event, loads any shards that might
    overlap with that event. Multiple shards can overlap because their
    time slices overlap.
    """
    while not event_heap and shards:
      # Try to pull events from unread shards.
      load_next_shards(shards[0].cmp_id)
    if event_heap and shards:
      # Pull events from all shards that overlap with the next event to be
      # yielded.
      load_next_shards(event_heap[0].id)
    elif not iterators:
      # No events in the heap and no active iterators? We're done!
      return
    shards_with_events = set(event.stream_shard for event in event_heap)
    for shard in iterators.keys():
      if shard in shards_with_events:
        continue
      try:
        it = iterators[shard]
        event = it.next()
        heapq.heappush(event_heap, event)
      except StopIteration:
        del iterators[shard]

  def _iterator(limit):
    load_overlapping_shards()  # Bootstrap.
    # No events?
    if not event_heap:
      raise StopIteration
    while event_heap or shards:
      if limit <= 0:
        raise StopIteration
      if event_heap:
        # Get the next event to return.
        event = heapq.heappop(event_heap)
        # Note: when `descending`, the TimeUUID comparator is flipped, so
        # the `>` and `>=` checks below cover both sort orders.
        if ((not descending and event.id > end_id) or
            (descending and event.id > start_id)):
          raise StopIteration
        elif ((not descending and event.id >= start_id) or
              (descending and event.id >= end_id)):
          limit -= 1
          yield event
      load_overlapping_shards()

  for event in _iterator(limit):
    yield event
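The shard iterator above is a lazy k-way merge: every shard yields events in sorted order and a heap tracks the globally smallest head. The same technique in miniature (the stdlib's `heapq.merge` implements it for plain iterators):

import heapq

def merge_sorted(*iterators):
  # Seed the heap with each iterator's head, then repeatedly pop the
  # smallest item and refill from the iterator it came from. The unique
  # index breaks ties so iterators themselves are never compared.
  heap = []
  for idx, it in enumerate(iterators):
    try:
      heap.append((it.next(), idx, it))
    except StopIteration:
      pass
  heapq.heapify(heap)
  while heap:
    value, idx, it = heapq.heappop(heap)
    yield value
    try:
      heapq.heappush(heap, (it.next(), idx, it))
    except StopIteration:
      pass

assert list(merge_sorted(iter([1, 4, 9]),
                         iter([2, 3, 10]))) == [1, 2, 3, 4, 9, 10]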
def _retrieve(self, namespace, stream, start_id, end_time, order, limit,
              configuration):
  """
  Yield events from `stream`, starting after the event with id `start_id`,
  up to and including events with timestamp `end_time`.
  """
  indices = self.index_manager.get_aliases(namespace,
                                           uuid_to_kronos_time(start_id),
                                           end_time)
  if not indices:
    return
  end_id = uuid_from_kronos_time(end_time, _type=UUIDType.HIGHEST)
  end_id.descending = start_id.descending = descending = (
    order == ResultOrder.DESCENDING)
  start_time = uuid_to_kronos_time(start_id)
  body_query = {
    'query': {
      'filtered': {
        'query': {'match_all': {}},
        'filter': {
          'range': {TIMESTAMP_FIELD: {'gte': start_time, 'lte': end_time}}
        }
      }
    }
  }
  order = 'desc' if descending else 'asc'
  sort_query = ['%s:%s' % (TIMESTAMP_FIELD, order),
                '%s:%s' % (ID_FIELD, order)]
  # `last_id` is the dedup cursor; the TimeUUID comparator is flipped when
  # `descending`, so `<=` below works for both sort orders.
  last_id = end_id if descending else start_id
  scroll_id = None
  while True:
    size = max(min(limit, configuration['read_size']) / self.shards, 10)
    if scroll_id is None:
      res = self.es.search(index=indices,
                           doc_type=stream,
                           size=size,
                           body=body_query,
                           sort=sort_query,
                           _source=True,
                           scroll='1m',
                           ignore=[400, 404],
                           allow_no_indices=True,
                           ignore_unavailable=True)
    else:
      res = self.es.scroll(scroll_id, scroll='1m')
    if '_scroll_id' not in res:
      break
    scroll_id = res['_scroll_id']
    hits = res.get('hits', {}).get('hits')
    if not hits:
      break
    for hit in hits:
      _id = TimeUUID(hit['_id'], descending=descending)
      if _id <= last_id:
        # Already yielded, or before the requested window; skip it.
        continue
      last_id = _id
      event = hit['_source']
      yield json.dumps(event)
      limit -= 1
      if limit == 0:
        break
    if limit == 0:
      # The inner `break` only exits the hits loop; stop scrolling too.
      break
  if scroll_id is not None:
    self.es.clear_scroll(scroll_id)