def test_stream_inversion_timestamps(self):
  """
  Test that if stream is inverted, timestamp of last action is the
  timestamp of the last action in the previous funnel step.
  """
  from analysis import IdentityDict
  from analysis import _stream_earliest_action
  mock_client = Mock()
  mock_client.get = Mock(side_effect=[self.get_stream5()])
  window_start = datetime.datetime(2014, 3, 20)
  window_end = datetime.datetime(2014, 3, 21)
  inverted_step = FunnelStep('stream5', invert=True)
  # Both users completed the previous step at the window start.
  previous_actions = {'0': datetime_to_kronos_time(window_start),
                      '1': datetime_to_kronos_time(window_start)}
  fuzzy_window = timedelta_to_kronos_time(datetime.timedelta(minutes=5))
  step_output = _stream_earliest_action(mock_client, inverted_step,
                                        window_start, window_end,
                                        fuzzy_window, previous_actions,
                                        {'userId': IdentityDict()})
  user_action = step_output['user_action']
  self.assertEqual(len(user_action), 1)
  # The surviving user keeps the previous step's timestamp, not one from
  # the inverted stream.
  self.assertEqual(user_action['1'], datetime_to_kronos_time(window_start))
def delete(self, stream, start_time, end_time, start_id=None, namespace=None):
  """
  Delete events in the stream with name `stream` that occurred between
  `start_time` and `end_time` (both inclusive). An optional `start_id`
  allows the client to delete events starting from after an ID rather than
  starting at a timestamp.
  """
  # Accept ISO-format strings or datetimes for convenience; normalize both
  # down to Kronos time units before building the request.
  if isinstance(start_time, types.StringTypes):
    start_time = parse(start_time)
  if isinstance(end_time, types.StringTypes):
    end_time = parse(end_time)
  if isinstance(start_time, datetime):
    start_time = datetime_to_kronos_time(start_time)
  if isinstance(end_time, datetime):
    end_time = datetime_to_kronos_time(end_time)
  request_dict = {
    'stream': stream,
    'end_time': end_time
  }
  # Match `get`: compare against None explicitly so a falsy (but valid)
  # start ID is still honored instead of silently falling back to
  # `start_time`.
  if start_id is not None:
    request_dict['start_id'] = start_id
  else:
    request_dict['start_time'] = start_time
  namespace = namespace or self.namespace
  if namespace is not None:
    request_dict['namespace'] = namespace
  return self._make_request(self._delete_url, data=request_dict)
def test_stream_inversion_timestamps(self):
  """
  When a funnel step is inverted, the last-action timestamp reported for a
  surviving user must come from the previous funnel step.
  """
  from analysis import IdentityDict
  from analysis import _stream_earliest_action
  client = Mock()
  client.get = Mock(side_effect=[self.get_stream5()])
  begin = datetime.datetime(2014, 3, 20)
  finish = datetime.datetime(2014, 3, 21)
  step = FunnelStep('stream5', invert=True)
  prior = {user: datetime_to_kronos_time(begin) for user in ('0', '1')}
  output = _stream_earliest_action(
    client, step, begin, finish,
    timedelta_to_kronos_time(datetime.timedelta(minutes=5)),
    prior, {'userId': IdentityDict()})
  actions = output['user_action']
  self.assertEqual(len(actions), 1)
  self.assertEqual(actions['1'], datetime_to_kronos_time(begin))
def delete(self, stream, start_time, end_time, start_id=None, namespace=None):
  """
  Delete events in the stream with name `stream` that occurred between
  `start_time` and `end_time` (both inclusive). An optional `start_id`
  allows the client to delete events starting from after an ID rather than
  starting at a timestamp.
  """
  # Normalize string and datetime bounds to Kronos time units.
  if isinstance(start_time, types.StringTypes):
    start_time = parse(start_time)
  if isinstance(end_time, types.StringTypes):
    end_time = parse(end_time)
  if isinstance(start_time, datetime):
    start_time = datetime_to_kronos_time(start_time)
  if isinstance(end_time, datetime):
    end_time = datetime_to_kronos_time(end_time)
  request_dict = {'stream': stream, 'end_time': end_time}
  # Consistent with `get`: an explicit None check so a falsy-but-valid ID
  # is not dropped in favor of `start_time`.
  if start_id is not None:
    request_dict['start_id'] = start_id
  else:
    request_dict['start_time'] = start_time
  namespace = namespace or self.namespace
  if namespace is not None:
    request_dict['namespace'] = namespace
  return self._make_request(self._delete_url, data=request_dict)
def process_args(): parser = argparse.ArgumentParser() parser.add_argument( '--kronos-url1', required=True, help='The first Kronos server to retrieve data from') parser.add_argument( '--kronos-url2', required=True, help='The second Kronos server to retrieve data from') parser.add_argument( '--namespace1', help='The namespace to read from the first Kronos server (optional)') parser.add_argument( '--namespace2', help='The namespace to read from the second Kronos server (optional)') parser.add_argument( '--stream1', help='The stream to read from the first Kronos server') parser.add_argument( '--stream2', help='The stream to read from the second Kronos server') parser.add_argument( '--streams-file', help='The name of the file with a stream name per line to copy') parser.add_argument( '--start', required=True, help='When to start retreiving? (format: 2003-09-25T10:49:41.5-03:00)') parser.add_argument( '--end', required=True, help='When to end retreiving? (format: 2003-09-25T10:49:41.5-03:00)') group = parser.add_argument_group(title='sampling arguments', description=('Only compare samples of ' 'events.')) group.add_argument('--num-samples', type=int, help=('Number of samples to compare?')) group.add_argument('--sample-interval', type=int, default=3600, help=('Interval of the sample (in seconds). Defaults to ' '1 hour (3600) (optional)')) args = parser.parse_args() if not bool(args.streams_file) ^ bool(args.stream1 and args.stream2): print 'Please specify either `stream-file` or both `stream1 and stream2`.' sys.exit(1) args.start = datetime_to_kronos_time(parse(args.start)) args.end = datetime_to_kronos_time(parse(args.end)) args.sample_interval = epoch_time_to_kronos_time(args.sample_interval) return args
def clear_data(self):
  """Delete every test event from both streams across the test window."""
  # Pad the window by the same margins generate_data can reach into.
  window_start = CohortTest.START_DATETIME - timedelta(weeks=2)
  window_end = (CohortTest.START_DATETIME +
                timedelta(weeks=(2 + len(CohortTest.EMAIL_WEEKS))) +
                timedelta(days=CohortTest.ACTION_REPETITION_DAYS))
  start_kt = datetime_to_kronos_time(window_start)
  end_kt = datetime_to_kronos_time(window_end)
  for stream in (CohortTest.EMAIL_STREAM, CohortTest.FRONTPAGE_STREAM):
    client.delete(stream, start_kt, end_kt)
def process_args(): parser = argparse.ArgumentParser() parser.add_argument('--kronos-url1', required=True, help='The first Kronos server to retrieve data from') parser.add_argument('--kronos-url2', required=True, help='The second Kronos server to retrieve data from') parser.add_argument( '--namespace1', help='The namespace to read from the first Kronos server (optional)') parser.add_argument( '--namespace2', help='The namespace to read from the second Kronos server (optional)') parser.add_argument('--stream1', help='The stream to read from the first Kronos server') parser.add_argument( '--stream2', help='The stream to read from the second Kronos server') parser.add_argument( '--streams-file', help='The name of the file with a stream name per line to copy') parser.add_argument( '--start', required=True, help='When to start retreiving? (format: 2003-09-25T10:49:41.5-03:00)') parser.add_argument( '--end', required=True, help='When to end retreiving? (format: 2003-09-25T10:49:41.5-03:00)') group = parser.add_argument_group(title='sampling arguments', description=('Only compare samples of ' 'events.')) group.add_argument('--num-samples', type=int, help=('Number of samples to compare?')) group.add_argument( '--sample-interval', type=int, default=3600, help=('Interval of the sample (in seconds). Defaults to ' '1 hour (3600) (optional)')) args = parser.parse_args() if not bool(args.streams_file) ^ bool(args.stream1 and args.stream2): print 'Please specify either `stream-file` or both `stream1 and stream2`.' sys.exit(1) args.start = datetime_to_kronos_time(parse(args.start)) args.end = datetime_to_kronos_time(parse(args.end)) args.sample_interval = epoch_time_to_kronos_time(args.sample_interval) return args
def generate_data(self): user_ids = range(700) # Email stream: Users in groups 1-5 get an email in weeks 1 and 3, # 2 and 4, 3 and 5, 4 and 1, 5 and 2, respectively. user_dates = {} # Fill in expected_output, which is of # the form: {cohort_date: {cohort_size: NN, # action_dates: {action_date: num_actions}}} self._expected = defaultdict( lambda: {'cohort_size': 0, 'action_dates': defaultdict(int)}) for user_id in user_ids: weeks1, weeks2 = CohortTest.EMAIL_WEEKS[ user_id % len(CohortTest.EMAIL_WEEKS)] week1 = CohortTest.START_DATETIME + timedelta(weeks=weeks1) date1 = week1 + timedelta(hours=randint(0, 72)) week2 = CohortTest.START_DATETIME + timedelta(weeks=weeks2) date2 = week2 + timedelta(hours=randint(0, 72)) week1 = datetime_to_date_str(week1) week2 = datetime_to_date_str(week2) user_dates[user_id] = ({'cohort': week1, 'precise': date1}, {'cohort': week2, 'precise': date2}) self._expected[week1]['cohort_size'] += 1 self._expected[week2]['cohort_size'] += 1 client.put({CohortTest.EMAIL_STREAM: [{'user': user_id, '@time': datetime_to_kronos_time(date1)}]}) client.put({CohortTest.EMAIL_STREAM: [{'user': user_id, '@time': datetime_to_kronos_time(date2)}]}) # Action stream: Users in group 1 hit the front page w/ 1/5 # percent chance, group 2 with 2/5 chance, etc. Likelihood for # all users on day N is 1/N. for user_id in user_ids: for email_dates in user_dates[user_id]: for day in xrange(CohortTest.ACTION_REPETITION_DAYS): group_probability = ( ((user_id % len(CohortTest.EMAIL_WEEKS)) + 1.0) / len(CohortTest.EMAIL_WEEKS)) day_probability = 1.0 / (day + 1) action_probability = group_probability * day_probability if random() < action_probability: action_date = email_dates['precise'] + timedelta(days=day) self._expected[email_dates['cohort']]['action_dates'][ datetime_to_date_str(action_date)] += 1 client.put({CohortTest.FRONTPAGE_STREAM: [{'user_id': user_id, '@time': action_date}]}) print json.dumps(self._expected, sort_keys=True, indent=2)
def test_lex_sort(self):
  """
  Verify that flipping UUID segments makes the v1 time UUIDs sort
  lexicographically in time order. A 230-year span is used so the high
  bits of the time UUIDs are forced to differ.
  """
  span_seconds = 230 * 365 * 24 * 60 * 60
  uuids = []
  for offset in range(0, span_seconds, 7250000):
    moment = datetime.datetime.now() + datetime.timedelta(seconds=offset)
    kronos_time = datetime_to_kronos_time(moment)
    # Two UUIDs at the identical timestamp exercise the tiebreaker.
    pair = sorted([uuid_from_kronos_time(kronos_time),
                   uuid_from_kronos_time(kronos_time)])
    uuids.extend(pair)
  uuids = [str(u) for u in uuids]
  # Flip, sort lexicographically, flip back: must equal the original
  # time-ordered sequence.
  round_tripped = sorted(sortable_time_uuid_str(u) for u in uuids)
  round_tripped = [flip_uuid_parts(u) for u in round_tripped]
  self.assertEqual(uuids, round_tripped)
def test_lex_sort(self):
  """
  This test ensures that the UUID segment flip enables correct
  lexicographic sorting of the v1 time UUIDs used. The timespan tested is
  230 years so that the high bits in the time UUID must differ.
  """
  # 230 years in seconds; stepping by ~84 days keeps the loop small while
  # still crossing high-bit boundaries in the timestamp.
  seconds = 230 * 365 * 24 * 60 * 60
  uuids = []
  for idx, sec in enumerate(range(0, seconds, 7250000)):
    dt = datetime.datetime.now() + datetime.timedelta(seconds=sec)
    kt = datetime_to_kronos_time(dt)
    # Two UUIDs built from the same timestamp exercise the deterministic
    # tiebreaker between equal-time events.
    event1 = uuid_from_kronos_time(kt)
    event2 = uuid_from_kronos_time(kt)
    events = sorted([event1, event2])
    uuids.append(events[0])
    uuids.append(events[1])
  uuids = [str(uuid) for uuid in uuids]
  # Flipping, sorting the strings lexicographically, and flipping back
  # must reproduce the original time-ordered sequence exactly.
  flipped_uuids = [sortable_time_uuid_str(uuid) for uuid in uuids]
  flipped_uuids = sorted(flipped_uuids)
  flipped_uuids = [flip_uuid_parts(uuid) for uuid in flipped_uuids]
  self.assertEqual(uuids, flipped_uuids)
def put(self, event_dict, namespace=None):
  """
  Sends a dictionary of `event_dict` of the form {stream_name: [event, ...],
  ...} to the server.
  """
  # Deep-copy so the caller's dictionaries are never mutated in place.
  payload = copy.deepcopy(event_dict)
  # Default timestamp for events that do not carry one.
  now = kronos_time_now()
  for event_list in payload.itervalues():
    for event in event_list:
      if TIMESTAMP_FIELD not in event:
        event[TIMESTAMP_FIELD] = now
      else:
        stamp = event[TIMESTAMP_FIELD]
        if isinstance(stamp, types.StringTypes):
          stamp = parse(stamp)
        if isinstance(stamp, datetime):
          stamp = datetime_to_kronos_time(stamp)
        event[TIMESTAMP_FIELD] = stamp
      event[LIBRARY_FIELD] = {'name': 'pykronos',
                              'version': pykronos.__version__}
  namespace = namespace or self.namespace
  if self._blocking:
    return self._put(namespace, payload)
  else:
    with self._put_lock:
      self._put_queue.append((namespace, payload))
def put(self, event_dict, namespace=None): """ Sends a dictionary of `event_dict` of the form {stream_name: [event, ...], ...} to the server. """ # Copy the input, in case we need to modify it by adding a timestamp. event_dict = copy.deepcopy(event_dict) # Ensure that all events have a timestamp. timestamp = kronos_time_now() for events in event_dict.itervalues(): for event in events: if TIMESTAMP_FIELD not in event: event[TIMESTAMP_FIELD] = timestamp else: if isinstance(event[TIMESTAMP_FIELD], types.StringTypes): event[TIMESTAMP_FIELD] = parse(event[TIMESTAMP_FIELD]) if isinstance(event[TIMESTAMP_FIELD], datetime): event[TIMESTAMP_FIELD] = datetime_to_kronos_time( event[TIMESTAMP_FIELD]) event[LIBRARY_FIELD] = { 'version': pykronos.__version__, 'name': 'pykronos' } namespace = namespace or self.namespace if self._blocking: return self._put(namespace, event_dict) else: with self._put_lock: self._put_queue.append((namespace, event_dict))
def get_stream5(self):
  """Return ten synthetic 'stream5' events, one per even user id 0..18,
  spaced one minute apart starting at 2014-03-21."""
  current = datetime.datetime(2014, 3, 21)
  one_minute = datetime.timedelta(minutes=1)
  events = []
  for user_num in range(0, 20, 2):
    events.append({'userId': str(user_num),
                   'type': 'a' if user_num % 2 else 'b',
                   '@time': datetime_to_kronos_time(current)})
    current += one_minute
  return events
def get_stream4(self):
  """Return two 'stream4' events keyed by 'username' (not 'userId'), with
  user '0' acting two minutes before user '1'."""
  base = datetime.datetime(2014, 3, 20)
  fuzzy_window = datetime.timedelta(minutes=2)
  schedule = zip(['0', '1'], [base - fuzzy_window, base])
  return [{'username': user_id,
           'type': 'a',
           '@time': datetime_to_kronos_time(when)}
          for user_id, when in schedule]
def get_stream5(self):
  """Return ten synthetic 'stream5' events keyed by 'userId', one minute
  apart starting at 2014-03-21."""
  start = datetime.datetime(2014, 3, 21)
  delta = datetime.timedelta(minutes=1)
  events = []
  for i in range(0, 20, 2):
    # NOTE(review): `i` is always even here, so `i % 2` is always 0 and
    # 'type' is always 'b' — confirm whether mixed types were intended.
    events.append({
      'userId': str(i),
      'type': 'a' if i % 2 else 'b',
      '@time': datetime_to_kronos_time(start)
    })
    start += delta
  return events
def _get_timeframe_bounds(self, timeframe, bucket_width):
  """
  Get a `bucket_width` aligned `start_time` and `end_time` from a
  `timeframe` dict.

  `timeframe['mode']` selects the interpretation: 'recent' means "the last
  value*scale ending now"; 'range' means explicit 'from'/'to' strings in
  DT_FORMAT. When `bucket_width` (seconds) is truthy, the window is widened
  outward so both ends land on bucket boundaries. Returns (start, end) as
  datetimes. Raises ValueError for any other mode.
  """
  if bucket_width:
    # Keep the width in both units: seconds for 'range' arithmetic on
    # datetimes, Kronos time for 'recent' arithmetic on Kronos values.
    bucket_width_seconds = bucket_width
    bucket_width = epoch_time_to_kronos_time(bucket_width)

  # TODO(derek): Potential optimization by setting the end_time equal to the
  # untrusted_time if end_time > untrusted_time and the results are not being
  # output to the user (only for caching)
  if timeframe['mode'] == 'recent':
    # Set end_time equal to now and align to bucket width
    end_time = datetime_to_kronos_time(datetime.datetime.now())
    original_end_time = end_time
    duration = get_seconds(timeframe['value'], timeframe['scale'])
    duration = epoch_time_to_kronos_time(duration)
    start_time = original_end_time - duration

    if bucket_width:
      # Align values to the bucket width
      # TODO(derek): Warn the user that the timeframe has been altered to
      # fit the bucket width
      if (end_time % bucket_width) != 0:
        # Round the end UP to the next boundary (widen, never shrink).
        end_time += bucket_width - (end_time % bucket_width)
      if (start_time % bucket_width) != 0:
        # Round the start DOWN to the previous boundary.
        start_time -= (start_time % bucket_width)

    start = kronos_time_to_datetime(start_time)
    end = kronos_time_to_datetime(end_time)
  elif timeframe['mode'] == 'range':
    end = datetime.datetime.strptime(timeframe['to'], DT_FORMAT)
    end_seconds = datetime_to_epoch_time(end)

    start = datetime.datetime.strptime(timeframe['from'], DT_FORMAT)
    start_seconds = datetime_to_epoch_time(start)

    if bucket_width:
      # Align values to the bucket width
      # TODO(derek): Warn the user that the timeframe has been altered to
      # fit the bucket width
      start_bump = start_seconds % bucket_width_seconds
      start -= datetime.timedelta(seconds=start_bump)
      if (end_seconds % bucket_width_seconds) != 0:
        end_bump = bucket_width_seconds - (end_seconds % bucket_width_seconds)
        end += datetime.timedelta(seconds=end_bump)
  else:
    raise ValueError("Timeframe mode must be 'recent' or 'range'")

  return start, end
def delete(self, stream, start_time, end_time, start_id=None, namespace=None):
  """
  Delete events in the stream with name `stream` that occurred between
  `start_time` and `end_time`. An optional `start_id` allows the client to
  delete events starting from an ID rather than a timestamp.

  Raises KronosClientError on a non-200 HTTP response or when the server
  reports a failure in its JSON body.
  """
  # Accept ISO-format strings or datetimes; normalize to Kronos time units.
  if isinstance(start_time, types.StringTypes):
    start_time = parse(start_time)
  if isinstance(end_time, types.StringTypes):
    end_time = parse(end_time)
  if isinstance(start_time, datetime):
    start_time = datetime_to_kronos_time(start_time)
  if isinstance(end_time, datetime):
    end_time = datetime_to_kronos_time(end_time)
  request_dict = {
    'stream': stream,
    'end_time': end_time
  }
  # Match `get`: compare against None explicitly so a falsy (but valid)
  # start ID is still honored.
  if start_id is not None:
    request_dict['start_id'] = start_id
  else:
    request_dict['start_time'] = start_time
  namespace = namespace or self.namespace
  if namespace is not None:
    request_dict['namespace'] = namespace
  response = requests.post(self._delete_url,
                           data=json.dumps(request_dict),
                           stream=True)
  if response.status_code != requests.codes.ok:
    raise KronosClientError('Bad server response code %d' %
                            response.status_code)
  response_dict = response.json()
  if not response_dict[SUCCESS_FIELD]:
    raise KronosClientError('Encountered errors %s' %
                            _get_errors(response_dict))
  return response_dict
def get_stream4(self):
  """Return two 'stream4' events keyed by 'username' rather than 'userId',
  with user '0' acting two minutes before user '1'."""
  user_ids = ['0', '1']
  # Offset separating the two users' action times.
  fuzzy_time = datetime.timedelta(minutes=2)
  times = [
    datetime.datetime(2014, 3, 20) - fuzzy_time,
    datetime.datetime(2014, 3, 20)
  ]
  events = []
  for user_id, time in zip(user_ids, times):
    events.append({
      'username': user_id,
      'type': 'a',
      '@time': datetime_to_kronos_time(time)
    })
  return events
def verify_results(self, result_func, cache, expected_results,
                   expected_computations):
  """
  Run `result_func` while spying on the cache's `_compute_bucket`, then
  assert the number of bucket computations, the result count, and each
  bucket's timestamp and 'b_sum' value.
  """
  with patch.object(cache, '_compute_bucket',
                    wraps=cache._compute_bucket) as bucket_spy:
    results = result_func()
    self.assertEqual(bucket_spy.call_count, expected_computations)
    self.assertEqual(len(results), expected_results)
    bucket_start = self.start_time
    minutes_per_bucket = self.bucket_width.total_seconds() / 60
    for idx, result in enumerate(results):
      self.assertEqual(result[TIMESTAMP_FIELD],
                       datetime_to_kronos_time(bucket_start))
      # Base values 2+7+12+17 plus the per-bucket offset.
      self.assertEqual(result['b_sum'],
                       sum([2, 7, 12, 17]) + idx * 4 * minutes_per_bucket)
      bucket_start += self.bucket_width
def put(self, event_dict, namespace=None):
  """
  Sends a dictionary of `event_dict` of the form {stream_name: [event, ...],
  ...} to the server.

  The `blocking` parameter allows the request to block until the server
  responds, and returns some information on the response. Here's an
  example:

  {u'stream_name_1': 3, u'stream_name_2': 1, u'@took': u'1ms'}
    -> put 3 events on stream_name_1
    -> put 1 event on stream_name_2
    -> put took 1ms to complete

  If `blocking` is false and the process running the client ends before
  flushing the pending data to the server, you might lose that data.
  Calling `flush` will block until all pending data has been acknowledged
  by the server.
  """
  # Work on a deep copy so callers never see their events mutated.
  event_dict = copy.deepcopy(event_dict)

  # Every event must carry a Kronos timestamp; default to "now".
  default_timestamp = kronos_time_now()
  for stream_events in event_dict.itervalues():
    for event in stream_events:
      if TIMESTAMP_FIELD in event:
        when = event[TIMESTAMP_FIELD]
        if isinstance(when, types.StringTypes):
          when = parse(when)
        if isinstance(when, datetime):
          when = datetime_to_kronos_time(when)
        event[TIMESTAMP_FIELD] = when
      else:
        event[TIMESTAMP_FIELD] = default_timestamp
      # Stamp the producing library name/version onto each event.
      event[LIBRARY_FIELD] = {'version': pykronos.__version__,
                              'name': 'pykronos'}
  namespace = namespace or self.namespace
  if self._blocking:
    return self._put(namespace, event_dict)
  with self._put_lock:
    self._put_queue.append((namespace, event_dict))
def put(self, event_dict, namespace=None): """ Sends a dictionary of `event_dict` of the form {stream_name: [event, ...], ...} to the server. The `blocking` parameter allows the request to block until the server responds, and returns some information on the response. Here's an example: {u'stream_name_1': 3, u'stream_name_2': 1, u'@took': u'1ms'} -> put 3 events on stream_name_1 -> put 1 event on stream_name_2 -> put took 1ms to complete If `blocking` is false and the process running the client ends before flushing the pending data to the server, you might lose that data. Calling `flush` will block until all pending data has been acknowledged by the server. """ # Copy the input, in case we need to modify it by adding a timestamp. event_dict = copy.deepcopy(event_dict) # Ensure that all events have a timestamp. timestamp = kronos_time_now() for events in event_dict.itervalues(): for event in events: if TIMESTAMP_FIELD not in event: event[TIMESTAMP_FIELD] = timestamp else: if isinstance(event[TIMESTAMP_FIELD], types.StringTypes): event[TIMESTAMP_FIELD] = parse(event[TIMESTAMP_FIELD]) if isinstance(event[TIMESTAMP_FIELD], datetime): event[TIMESTAMP_FIELD] = datetime_to_kronos_time( event[TIMESTAMP_FIELD]) event[LIBRARY_FIELD] = { 'version': pykronos.__version__, 'name': 'pykronos' } namespace = namespace or self.namespace if self._blocking: return self._put(namespace, event_dict) else: with self._put_lock: self._put_queue.append((namespace, event_dict))
def test_user_id_mapping_missing(self, mock_logging):
  """
  When an event lacks the expected 'userId' field, funnel_analyze logs an
  error identifying the offending event and raises UnboundLocalError.
  """
  mock_client = Mock()
  mock_client.get = Mock(side_effect=[self.get_stream1(), self.get_stream4()])
  window_start = datetime.datetime(2014, 3, 20)
  window_end = datetime.datetime(2014, 3, 21)
  first_step = FunnelStep('stream1')
  second_step = FunnelStep('stream2')
  expected_message = ("local variable 'user' referenced before "
                      "assignment")
  with self.assertRaisesRegexp(UnboundLocalError, expected_message):
    funnel_analyze(mock_client, [first_step, second_step], window_start,
                   window_end, window_end, {}, None)
  mock_logging.error.assert_called_with(
    'Unable to get field %s on %s from %s', 'userId', 'stream2',
    {'username': '******',
     'type': 'a',
     '@time': datetime_to_kronos_time(datetime.datetime(2014, 3, 19, 23,
                                                        58))})
def test_user_id_mapping_missing(self, mock_logging):
  # Verifies that when an event lacks the expected user-id field, the
  # funnel analysis logs an error for that event and (currently) raises
  # UnboundLocalError.
  client = Mock()
  client.get = Mock(side_effect=[self.get_stream1(), self.get_stream4()])
  start = datetime.datetime(2014, 3, 20)
  end = datetime.datetime(2014, 3, 21)
  step1 = FunnelStep('stream1')
  step2 = FunnelStep('stream2')
  with self.assertRaisesRegexp(
      UnboundLocalError,
      ("local variable 'user' referenced before "
       "assignment")):
    funnel_analyze(client, [step1, step2], start, end, end, {}, None)
  # The offending event — keyed 'username' instead of 'userId' — must be
  # logged verbatim.
  mock_logging.error.assert_called_with(
    'Unable to get field %s on %s from %s', 'userId', 'stream2', {
      'username': '******',
      'type': 'a',
      '@time': datetime_to_kronos_time(datetime.datetime(2014, 3, 19, 23,
                                                         58))
    })
{'source': 'http://test.com', 'browser': {'name': 'Firefox', 'version': 26}, 'pages': ['page1.html', 'page2.html']}], 'yourproduct.website.clicks': [ {'user': 40, 'num_clicks': 7}, {'user': 42, 'num_clicks': 2}] }) """ ### Optionally Add A Timestamp By default, each event will be timestamped on the client. If you add a `TIMESTAMP_FIELD` argument, you can specify the time at which each event occurred. """ optional_time = datetime_to_kronos_time(start + timedelta(seconds=5)) kc.put({'yourproduct.website.clicks': [ {'user': 35, 'num_clicks': 10, TIMESTAMP_FIELD: optional_time}]}) """ ## Retrieving Events Retrieving events requires a stream name, a start datetime, and an end datetime. Note that an `ID_FIELD` and `@TIMESTAMP_FIELD` field are attached to each event. The `ID_FIELD` is a UUID1-style identifier with its time bits derived from the timestamp. This allows event IDs to be roughly sortable by the time that they happened while providing a deterministic tiebreaker when two events happened at the same time. """ events = kc.get('yourproduct.website.clicks',
def get(self, stream, start_time, end_time, start_id=None, limit=None,
        order=ResultOrder.ASCENDING, namespace=None, timeout=None):
  """
  Queries a stream with name `stream` for all events between `start_time`
  and `end_time` (both inclusive). An optional `start_id` allows the client
  to restart from a failure, specifying the last ID they read; all events
  that happened after that ID will be returned. An optional `limit` limits
  the maximum number of events returned. An optional `order` requests
  results in `ASCENDING` or `DESCENDING` order.

  Yields decoded event dictionaries. Retries transient failures up to 10
  times, resuming after the last event ID seen; raises KronosClientError
  on a timeout or once the retry budget is exhausted.
  """
  # Accept ISO-format strings or datetimes; normalize to Kronos time units.
  if isinstance(start_time, types.StringTypes):
    start_time = parse(start_time)
  if isinstance(end_time, types.StringTypes):
    end_time = parse(end_time)
  if isinstance(start_time, datetime):
    start_time = datetime_to_kronos_time(start_time)
  if isinstance(end_time, datetime):
    end_time = datetime_to_kronos_time(end_time)
  request_dict = {
    'stream': stream,
    'end_time': end_time,
    'order': order,
  }
  if start_id is not None:
    request_dict['start_id'] = start_id
  else:
    request_dict['start_time'] = start_time
  if limit is not None:
    request_dict['limit'] = limit
  namespace = namespace or self.namespace
  if namespace is not None:
    request_dict['namespace'] = namespace
  errors = []
  last_id = None
  while True:
    try:
      response = self._make_request(self._get_url, data=request_dict,
                                    stream=True, timeout=timeout)
      for line in response.iter_lines(chunk_size=self._chunk_size):
        if line:
          # Python's json adds a lot of overhead when decoding a large
          # number of events; ujson fares better. However ujson won't work
          # on PyPy since it's a C extension.
          event = ujson.loads(line, precise_float=True)
          # Track the last event seen so a retry can resume after it.
          last_id = event[ID_FIELD]
          yield event
      break
    except Exception, e:
      # Timeouts are fatal; anything else is retried with linear backoff.
      if isinstance(e, requests.exceptions.Timeout):
        raise KronosClientError('Request timed out.')
      errors.append(e)
      if len(errors) == 10:
        raise KronosClientError(errors)
      if last_id is not None:
        # Resume after the last delivered event instead of re-reading
        # from start_time.
        request_dict.pop('start_time', None)
        request_dict['start_id'] = last_id
      time.sleep(len(errors) * 0.1)
""" kc.put({'yourproduct.website.pageviews': [ {'source': 'http://test.com', 'browser': {'name': 'Firefox', 'version': 26}, 'pages': ['page1.html', 'page2.html']}], 'yourproduct.website.clicks': [ {'user': 40, 'num_clicks': 7}, {'user': 42, 'num_clicks': 2}]}) ## Optionally add a timestamp """ By default, each event will be timestamped on the client. If you add a `TIMESTAMP_FIELD` argument, you can specify the time at which each event ocurred. """ optional_time = datetime_to_kronos_time(start + timedelta(seconds=5)) kc.put({'yourproduct.website.clicks': [ {'user': 35, 'num_clicks': 10, TIMESTAMP_FIELD: optional_time}]}) # Retrieving data """ Retrieving data requires a stream name, a start datetime, and an end datetime. Note that an `ID_FIELD` and `@TIMESTAMP_FIELD` field are attached to each event. The `ID_FIELD` is a UUID1-style identifier with its time bits derived from the timestamp. This allows event IDs to be roughly sortable by the time that they happened while providing a deterministic tiebreaker when two events happened at the same time. """ events = kc.get('yourproduct.website.clicks', start,
import logging

from datetime import timedelta

from pykronos.common.event_tools import get_property
from pykronos.client import TIMESTAMP_FIELD
from pykronos.common.time import datetime_to_kronos_time
from pykronos.common.time import timedelta_to_kronos_time
from pykronos.common.time import EPOCH

log = logging.getLogger(__name__)

EARLIEST_TIME = datetime_to_kronos_time(EPOCH)


class FilterCache(object):
  """Memoizes the results of an optional boolean filter over keys."""

  def __init__(self, a_filter):
    self._filter = a_filter
    self._filter_results = {}

  def _caching_filter(self, key):
    # With no filter configured, every key passes.
    if not self._filter:
      return True
    cached = self._filter_results.get(key, None)
    if cached is None:
      cached = self._filter(key)
      self._filter_results[key] = cached
    return cached

  def get(self, key):
    # Keys accepted by the filter map to the earliest representable time;
    # rejected keys map to None.
    return EARLIEST_TIME if self._caching_filter(key) else None
def get(self, stream, start_time, end_time, start_id=None, limit=None,
        order=ResultOrder.ASCENDING, namespace=None, timeout=None):
  """
  Queries a stream with name `stream` for all events between `start_time`
  and `end_time` (both inclusive). An optional `start_id` allows the client
  to restart from a failure, specifying the last ID they read; all events
  that happened after that ID will be returned. An optional `limit` limits
  the maximum number of events returned. An optional `order` requests
  results in `ASCENDING` or `DESCENDING` order.

  Generator: yields decoded event dicts, retrying transient errors up to
  10 times (resuming after the last delivered event) and raising
  KronosClientError on timeout or retry exhaustion.
  """
  # Normalize string/datetime bounds down to Kronos time units.
  if isinstance(start_time, types.StringTypes):
    start_time = parse(start_time)
  if isinstance(end_time, types.StringTypes):
    end_time = parse(end_time)
  if isinstance(start_time, datetime):
    start_time = datetime_to_kronos_time(start_time)
  if isinstance(end_time, datetime):
    end_time = datetime_to_kronos_time(end_time)
  request_dict = {
    'stream': stream,
    'end_time': end_time,
    'order': order,
  }
  if start_id is not None:
    request_dict['start_id'] = start_id
  else:
    request_dict['start_time'] = start_time
  if limit is not None:
    request_dict['limit'] = limit
  namespace = namespace or self.namespace
  if namespace is not None:
    request_dict['namespace'] = namespace
  errors = []
  last_id = None
  while True:
    try:
      response = self._make_request(self._get_url, data=request_dict,
                                    stream=True, timeout=timeout)
      for line in response.iter_lines(chunk_size=self._chunk_size):
        if line:
          # Python's json adds a lot of overhead when decoding a large
          # number of events; ujson fares better. However ujson won't work
          # on PyPy since it's a C extension.
          event = ujson.loads(line, precise_float=True)
          # Remember the last ID delivered so a retry can resume after it.
          last_id = event[ID_FIELD]
          yield event
      break
    except Exception, e:
      # A timeout is not retried; other errors back off linearly.
      if isinstance(e, requests.exceptions.Timeout):
        raise KronosClientError('Request timed out.')
      errors.append(e)
      if len(errors) == 10:
        raise KronosClientError(errors)
      if last_id is not None:
        # Switch from time-based to ID-based paging for the retry.
        request_dict.pop('start_time', None)
        request_dict['start_id'] = last_id
      time.sleep(len(errors) * 0.1)