def main(args):
  read_client = KronosClient(args.read_url, namespace=args.read_namespace)
  write_client = KronosClient(args.write_url, namespace=args.write_namespace,
                              blocking=False)
  start_time = time.time()
  time_step = timedelta(seconds=args.copy_period_seconds)
  for stream in args.stream_file:
    stream = stream.rstrip()
    print 'Starting stream', stream, time.time() - start_time
    start = args.start
    # Keep track of the last ID we read, so we re-run queries from there.
    last_read_id = None
    while start <= args.end:
      print '...start is', start, time.time() - start_time
      end = min(args.end, start + time_step)
      if last_read_id is None:
        read_stream = read_client.get(stream, start, end)
      else:
        read_stream = read_client.get(stream, None, end,
                                      start_id=last_read_id)
      for event in read_stream:
        if event[ID_FIELD] != last_read_id:
          last_read_id = event[ID_FIELD]
          del event[ID_FIELD]
          write_client.put({stream: [event]})
      start += time_step
    write_client.flush()
    print 'Completed stream', stream, time.time() - start_time
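# A minimal sketch of the command-line wiring that could drive main() above.
# The flag names, defaults, and the use of dateutil to parse --start/--end are
# assumptions made for illustration; the only requirement taken from the code
# above is that the parsed namespace exposes read_url, write_url,
# read_namespace, write_namespace, copy_period_seconds, stream_file, start,
# and end.
if __name__ == '__main__':
  import argparse
  from dateutil.parser import parse

  parser = argparse.ArgumentParser(
    description='Copy streams from one Kronos server to another.')
  parser.add_argument('--read-url', required=True)
  parser.add_argument('--write-url', required=True)
  parser.add_argument('--read-namespace', default=None)
  parser.add_argument('--write-namespace', default=None)
  parser.add_argument('--copy-period-seconds', type=int, default=3600,
                      help='Width of each copy window, in seconds.')
  parser.add_argument('--stream-file', type=argparse.FileType('r'),
                      required=True,
                      help='File containing one stream name per line.')
  parser.add_argument('--start', type=parse, required=True,
                      help='Copy events at or after this datetime.')
  parser.add_argument('--end', type=parse, required=True,
                      help='Copy events at or before this datetime.')
  main(parser.parse_args())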
def load_test_data(args):
  donations = ZipFile(StringIO(urllib2.urlopen(DONATIONS_FILE_URL).read()))
  donations = StringIO(donations.read('%s.csv' % DONATIONS_FILE_NAME))
  events = []
  rows = csv.DictReader(donations)
  for row in rows:
    row[TIMESTAMP_FIELD] = parse(row['contb_receipt_dt'])
    events.append(row)
  kc = KronosClient(args.kronos_url)
  kc.put({'donations': events})
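# Once loaded, the 'donations' stream can be read back with the same client.
# A minimal sketch, not part of the loader: it assumes a running Kronos server
# at kronos_url and simply prints each event dict stored by load_test_data()
# above, for whatever datetime range the caller supplies.
def print_donations(kronos_url, start, end):
  kc = KronosClient(kronos_url)
  for event in kc.get('donations', start, end):
    print event[TIMESTAMP_FIELD], event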
def main(args):
  client = KronosClient(args.kronos_url, namespace=args.namespace,
                        blocking=False)
  increment = timedelta(microseconds=args.microseconds_between_events)
  event = {'property%s' % (idx): idx
           for idx in xrange(args.properties_per_event)}
  start_time = time.time()
  for idx in xrange(args.num_events):
    event[TIMESTAMP_FIELD] = args.start + (idx * increment)
    client.put({args.stream: [event]})
    if (idx % args.chunk_size) == 0:
      print 'Completed', idx, 'events', time.time() - start_time
      client.flush()
  client.flush()
class QueryCacheTest(unittest.TestCase):
  def setUp(self):
    self.client = KronosClient('http://localhost:9191/',
                               blocking=False,
                               sleep_block=0.2)
    self.total_events = 500
    self.computed_namespace = 'computed'
    self.increment = timedelta(minutes=1)
    self.start_time = datetime(2014, 6, 4, 22)
    self.bucket_width = timedelta(minutes=20)

  def compute_cache_test(function):
    """A wrapper that sets up a stream with test data.

    The stream takes the name of the function being run, and contains
    `self.total_events` events.  The events are each one `self.increment`
    apart.
    """
    @functools.wraps(function)
    def wrapper(self):
      self.stream = 'ComputeCacheTest_%s' % (function.__name__)
      for i in xrange(self.total_events):
        self.client.put({
          self.stream: [{TIMESTAMP_FIELD: self.start_time +
                                          (self.increment * i),
                         'a': i % 5,
                         'b': i}]})
      self.client.flush()
      function(self)
    return wrapper

  def filter_and_sum(self, start_time, end_time):
    """Bin `self.stream` into buckets, returning the sum of `b` when `a` == 2.

    For all events between `start_time` and `end_time`, create an event for
    every 20-minute interval of events that contains the sum of `b` when
    `a` == 2.
    """
    events = self.client.get(self.stream, start_time, end_time)
    counts = defaultdict(int)
    grouping_minutes = timedelta_to_kronos_time(self.bucket_width)
    for event in events:
      if event['a'] == 2:
        counts[event['@time'] -
               (event['@time'] % grouping_minutes)] += event['b']
    for group_time in sorted(counts.iterkeys()):
      yield {'@time': group_time, 'b_sum': counts[group_time]}

  def verify_results(self, result_func, cache, expected_results,
                     expected_computations):
    with patch.object(cache, '_compute_bucket',
                      wraps=cache._compute_bucket) as mock_method:
      results = result_func()
      self.assertEqual(mock_method.call_count, expected_computations)
      self.assertEqual(len(results), expected_results)
      result_time = self.start_time
      for idx, result in enumerate(results):
        self.assertEqual(result[TIMESTAMP_FIELD],
                         datetime_to_kronos_time(result_time))
        self.assertEqual(
          result['b_sum'],
          sum([2, 7, 12, 17] +
              [idx * 4 * (self.bucket_width.total_seconds() / 60)]))
        result_time += self.bucket_width

  def test_cache_exceptions(self):
    # Bucket width shouldn't be more granular than 1 second.
    def bad_bucket_width():
      return QueryCache(self.client, self.filter_and_sum,
                        self.bucket_width + timedelta(milliseconds=1),
                        self.computed_namespace)
    self.assertRaises(ValueError, bad_bucket_width)

    # start_time and end_time should align to bucket_width boundaries.
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)

    def bad_start_boundary():
      return list(
        cache.retrieve_interval(start_time + timedelta(minutes=1), end_time))
    self.assertRaises(ValueError, bad_start_boundary)

  @compute_cache_test
  def test_cache_layer(self):
    cache = QueryCache(self.client, self.filter_and_sum,
                       self.bucket_width, self.computed_namespace)
    start_time = self.start_time - (self.bucket_width * 3)
    end_time = self.start_time + (self.total_events * self.increment) + (
      self.bucket_width * 3)
    untrusted_time = self.start_time + (
      timedelta(minutes=(self.total_events / 2) - 25))

    # Verify all results were computed correctly.
    self.verify_results(lambda: list(
      cache.compute_and_cache_missing_buckets(start_time, end_time,
                                              untrusted_time)),
                        cache, 25, 31)

    # Verify only trusted results are cached.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Running the same operations twice should result in the same results as
    # before.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Expanding the time range without caching should also result in the same
    # results.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width)),
      cache, 11, 0)

    # But specifying compute_missing should get all results for the time
    # range.
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                           end_time + self.bucket_width,
                                           compute_missing=True)),
      cache, 25, 19)

    # Overlapping time queries should result in the same results as before,
    # and benefit from the cache.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time - self.bucket_width,
        end_time + self.bucket_width,
        untrusted_time)),
      cache, 25, 19)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 11, 0)

    # Increasing the trusted time should increase the cached results.
    untrusted_time = untrusted_time + timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 17)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Decreasing the trusted time shouldn't remove results.
    untrusted_time = untrusted_time - timedelta(minutes=40)
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 15)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # If there are two cached entries for a bucket, that cached time should
    # no longer be returned.
    results = list(cache.retrieve_interval(start_time, end_time))
    duplicate_result = dict(results[10])
    duplicate_result['b_sum'] = 0
    self.client.put({cache._scratch_stream: [duplicate_result]},
                    namespace=cache._scratch_namespace)
    self.client.flush()
    safe_results = list(cache.retrieve_interval(start_time, end_time))
    self.assertEqual(results[:10] + results[11:], safe_results)

    # Rerunning the cache/computation should re-cache the corrupted element.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                           end_time,
                                                           untrusted_time)),
      cache, 25, 16)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)

    # Forcing computation should generate the same result set.
    self.verify_results(
      lambda: list(cache.compute_and_cache_missing_buckets(
        start_time, end_time, untrusted_time, force_recompute=True)),
      cache, 25, 31)
    self.verify_results(
      lambda: list(cache.retrieve_interval(start_time, end_time)),
      cache, 13, 0)
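# The test above exercises the full QueryCache flow: construct a cache over a
# user-supplied computation, call compute_and_cache_missing_buckets() to fill
# buckets (caching only those that end before the untrusted time), then call
# retrieve_interval() to read cached results back.  A minimal usage sketch
# follows; the 'purchases' stream, the 'amount' field, the one-day bucket
# width, and the assumption that the computation is invoked once per bucket
# with that bucket's boundaries are illustrative, not taken from the test.
def cached_daily_totals(client, start, end, untrusted_time):
  def daily_total(bucket_start, bucket_end):
    # Like filter_and_sum above: read raw events for one bucket and yield a
    # single summary event for it.
    total = 0
    for event in client.get('purchases', bucket_start, bucket_end):
      total += event['amount']
    yield {TIMESTAMP_FIELD: datetime_to_kronos_time(bucket_start),
           'total': total}

  # start and end must align to bucket_width boundaries, as enforced in
  # test_cache_exceptions above.
  cache = QueryCache(client, daily_total, timedelta(days=1), 'computed')
  list(cache.compute_and_cache_missing_buckets(start, end, untrusted_time))
  return list(cache.retrieve_interval(start, end))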