Esempio n. 1
0
    def test_stream_inversion_timestamps(self):
        """Verify timestamp propagation for an inverted funnel step.

        When a stream is inverted, the timestamp reported for a user's
        last action must be the timestamp of that user's last action in
        the previous funnel step.
        """
        from analysis import IdentityDict
        from analysis import _stream_earliest_action

        mock_client = Mock()
        mock_client.get = Mock(side_effect=[self.get_stream5()])

        window_start = datetime.datetime(2014, 3, 20)
        window_end = datetime.datetime(2014, 3, 21)
        inverted_step = FunnelStep('stream5', invert=True)
        fuzzy_time = timedelta_to_kronos_time(
            datetime.timedelta(minutes=5))
        # Both users acted at `window_start` in the previous step.
        previous_actions = {
            '0': datetime_to_kronos_time(window_start),
            '1': datetime_to_kronos_time(window_start)
        }
        step_output = _stream_earliest_action(
            mock_client, inverted_step, window_start, window_end,
            fuzzy_time, previous_actions, {'userId': IdentityDict()})
        user_action = step_output['user_action']
        # Only user '1' is absent from stream5, so only it survives the
        # inverted step, carrying the previous step's timestamp.
        self.assertEqual(len(user_action), 1)
        self.assertEqual(user_action['1'],
                         datetime_to_kronos_time(window_start))
Esempio n. 2
0
  def delete(self, stream, start_time, end_time, start_id=None, namespace=None):
    """
    Delete events in the stream with name `stream` that occurred between
    `start_time` and `end_time` (both inclusive).  An optional `start_id` allows
    the client to delete events starting from after an ID rather than starting
    at a timestamp.  `start_time` and `end_time` may be date strings,
    `datetime` objects, or raw kronos timestamps.
    """
    # Normalize string -> datetime -> kronos-time for both bounds.
    if isinstance(start_time, types.StringTypes):
      start_time = parse(start_time)
    if isinstance(end_time, types.StringTypes):
      end_time = parse(end_time)
    if isinstance(start_time, datetime):
      start_time = datetime_to_kronos_time(start_time)
    if isinstance(end_time, datetime):
      end_time = datetime_to_kronos_time(end_time)
    request_dict = {
      'stream': stream,
      'end_time': end_time
    }
    # Test against None (not truthiness) so a falsy-but-valid ID is still
    # honored; this matches the `is not None` check used by `get`.
    if start_id is not None:
      request_dict['start_id'] = start_id
    else:
      request_dict['start_time'] = start_time

    namespace = namespace or self.namespace
    if namespace is not None:
      request_dict['namespace'] = namespace

    return self._make_request(self._delete_url, data=request_dict)
  def test_stream_inversion_timestamps(self):
    """
    Test that if stream is inverted, timestamp of last action is the
    timestamp of the last action in the previous funnel step.
    """
    # Imported inside the test; presumably to avoid import-time side
    # effects at module load — TODO confirm against the test module.
    from analysis import IdentityDict
    from analysis import _stream_earliest_action

    # The mocked client serves stream5's events for the single get() call.
    client = Mock()
    client.get = Mock(side_effect=[self.get_stream5()])

    start = datetime.datetime(2014,3,20)
    end = datetime.datetime(2014,3,21)
    # invert=True: the step matches users who did NOT appear in stream5.
    step = FunnelStep('stream5', invert=True)
    # Both users acted at `start` in the previous funnel step.
    last_user_action = {'0': datetime_to_kronos_time(start),
                        '1': datetime_to_kronos_time(start)}
    step_output = _stream_earliest_action(client, step,
                                          start, end,
                                          timedelta_to_kronos_time(
                                            datetime.timedelta(minutes=5)),
                                          last_user_action,
                                          {'userId': IdentityDict()})
    user_action = step_output['user_action']
    # Only user '1' is absent from stream5 (see get_stream5), so only it
    # survives, keeping the previous step's timestamp.
    self.assertEqual(len(user_action), 1)
    self.assertEqual(user_action['1'], datetime_to_kronos_time(start))
Esempio n. 4
0
    def delete(self,
               stream,
               start_time,
               end_time,
               start_id=None,
               namespace=None):
        """
    Delete events in the stream with name `stream` that occurred between
    `start_time` and `end_time` (both inclusive).  An optional `start_id` allows
    the client to delete events starting from after an ID rather than starting
    at a timestamp.  `start_time` and `end_time` may be date strings,
    `datetime` objects, or raw kronos timestamps.
    """
        # Normalize string -> datetime -> kronos-time for both bounds.
        if isinstance(start_time, types.StringTypes):
            start_time = parse(start_time)
        if isinstance(end_time, types.StringTypes):
            end_time = parse(end_time)
        if isinstance(start_time, datetime):
            start_time = datetime_to_kronos_time(start_time)
        if isinstance(end_time, datetime):
            end_time = datetime_to_kronos_time(end_time)
        request_dict = {'stream': stream, 'end_time': end_time}
        # Test against None (not truthiness) so a falsy-but-valid ID is
        # still honored; this matches the `is not None` check used by `get`.
        if start_id is not None:
            request_dict['start_id'] = start_id
        else:
            request_dict['start_time'] = start_time

        namespace = namespace or self.namespace
        if namespace is not None:
            request_dict['namespace'] = namespace

        return self._make_request(self._delete_url, data=request_dict)
Esempio n. 5
0
def process_args():
  parser = argparse.ArgumentParser()
  parser.add_argument(
    '--kronos-url1',
    required=True,
    help='The first Kronos server to retrieve data from')
  parser.add_argument(
    '--kronos-url2',
    required=True,
    help='The second Kronos server to retrieve data from')
  parser.add_argument(
    '--namespace1',
    help='The namespace to read from the first Kronos server (optional)')
  parser.add_argument(
    '--namespace2',
    help='The namespace to read from the second Kronos server (optional)')
  parser.add_argument(
    '--stream1',
    help='The stream to read from the first Kronos server')
  parser.add_argument(
    '--stream2',
    help='The stream to read from the second Kronos server')
  parser.add_argument(
    '--streams-file',
    help='The name of the file with a stream name per line to copy')
  parser.add_argument(
    '--start',
    required=True,
    help='When to start retreiving? (format: 2003-09-25T10:49:41.5-03:00)')
  parser.add_argument(
    '--end',
    required=True,
    help='When to end retreiving? (format: 2003-09-25T10:49:41.5-03:00)')
  group = parser.add_argument_group(title='sampling arguments',
                                    description=('Only compare samples of '
                                                 'events.'))
  group.add_argument('--num-samples',
                     type=int,
                     help=('Number of samples to compare?'))
  group.add_argument('--sample-interval',
                     type=int,
                     default=3600,
                     help=('Interval of the sample (in seconds). Defaults to '
                           '1 hour (3600) (optional)'))
  args = parser.parse_args()

  if not bool(args.streams_file) ^ bool(args.stream1 and args.stream2):
    print 'Please specify either `stream-file` or both `stream1 and stream2`.'
    sys.exit(1)

  args.start = datetime_to_kronos_time(parse(args.start))
  args.end = datetime_to_kronos_time(parse(args.end))
  args.sample_interval = epoch_time_to_kronos_time(args.sample_interval)
  
  return args
Esempio n. 6
0
 def clear_data(self):
   """Delete every test event from the email and frontpage streams."""
   # Pad the window two weeks before the cohorts start and past the last
   # cohort week plus the action-repetition tail, so all events written
   # by generate_data fall inside the deleted range.
   start = CohortTest.START_DATETIME - timedelta(weeks=2)
   end = (CohortTest.START_DATETIME +
          timedelta(weeks=(2 + len(CohortTest.EMAIL_WEEKS))) +
          timedelta(days=CohortTest.ACTION_REPETITION_DAYS))
   client.delete(CohortTest.EMAIL_STREAM,
                 datetime_to_kronos_time(start),
                 datetime_to_kronos_time(end))
   client.delete(CohortTest.FRONTPAGE_STREAM,
                 datetime_to_kronos_time(start),
                 datetime_to_kronos_time(end))    
Esempio n. 7
0
def process_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--kronos-url1',
                        required=True,
                        help='The first Kronos server to retrieve data from')
    parser.add_argument('--kronos-url2',
                        required=True,
                        help='The second Kronos server to retrieve data from')
    parser.add_argument(
        '--namespace1',
        help='The namespace to read from the first Kronos server (optional)')
    parser.add_argument(
        '--namespace2',
        help='The namespace to read from the second Kronos server (optional)')
    parser.add_argument('--stream1',
                        help='The stream to read from the first Kronos server')
    parser.add_argument(
        '--stream2', help='The stream to read from the second Kronos server')
    parser.add_argument(
        '--streams-file',
        help='The name of the file with a stream name per line to copy')
    parser.add_argument(
        '--start',
        required=True,
        help='When to start retreiving? (format: 2003-09-25T10:49:41.5-03:00)')
    parser.add_argument(
        '--end',
        required=True,
        help='When to end retreiving? (format: 2003-09-25T10:49:41.5-03:00)')
    group = parser.add_argument_group(title='sampling arguments',
                                      description=('Only compare samples of '
                                                   'events.'))
    group.add_argument('--num-samples',
                       type=int,
                       help=('Number of samples to compare?'))
    group.add_argument(
        '--sample-interval',
        type=int,
        default=3600,
        help=('Interval of the sample (in seconds). Defaults to '
              '1 hour (3600) (optional)'))
    args = parser.parse_args()

    if not bool(args.streams_file) ^ bool(args.stream1 and args.stream2):
        print 'Please specify either `stream-file` or both `stream1 and stream2`.'
        sys.exit(1)

    args.start = datetime_to_kronos_time(parse(args.start))
    args.end = datetime_to_kronos_time(parse(args.end))
    args.sample_interval = epoch_time_to_kronos_time(args.sample_interval)

    return args
Esempio n. 8
0
  def generate_data(self):
    """Write synthetic email/frontpage events and record expected output.

    Populates self._expected with the cohort sizes and per-date action
    counts that the cohort analysis under test should reproduce.
    """
    user_ids = range(700)

    # Email stream: Users in groups 1-5 get an email in weeks 1 and 3,
    # 2 and 4, 3 and 5, 4 and 1, 5 and 2, respectively.
    user_dates = {}
    # Fill in expected_output, which is of
    # the form: {cohort_date: {cohort_size: NN,
    #                          action_dates: {action_date: num_actions}}}
    self._expected = defaultdict(
      lambda: {'cohort_size': 0, 'action_dates': defaultdict(int)})
    for user_id in user_ids:
      # Pick this user's two email weeks from the rotating schedule.
      weeks1, weeks2 = CohortTest.EMAIL_WEEKS[
        user_id % len(CohortTest.EMAIL_WEEKS)]
      week1 = CohortTest.START_DATETIME + timedelta(weeks=weeks1)
      # Jitter the precise email time up to 72 hours into the week.
      date1 = week1 + timedelta(hours=randint(0, 72))
      week2 = CohortTest.START_DATETIME + timedelta(weeks=weeks2)
      date2 = week2 + timedelta(hours=randint(0, 72))
      week1 = datetime_to_date_str(week1)
      week2 = datetime_to_date_str(week2)      
      user_dates[user_id] = ({'cohort': week1, 'precise': date1},
                             {'cohort': week2, 'precise': date2})
      # Each user contributes to the size of both of its cohorts.
      self._expected[week1]['cohort_size'] += 1
      self._expected[week2]['cohort_size'] += 1
      client.put({CohortTest.EMAIL_STREAM: [{'user': user_id,
                                  '@time': datetime_to_kronos_time(date1)}]})
      client.put({CohortTest.EMAIL_STREAM: [{'user': user_id,
                                  '@time': datetime_to_kronos_time(date2)}]})      
    
    # Action stream: Users in group 1 hit the front page w/ 1/5
    # percent chance, group 2 with 2/5 chance, etc.  Likelihood for
    # all users on day N is 1/N.
    for user_id in user_ids:
      for email_dates in user_dates[user_id]:
        for day in xrange(CohortTest.ACTION_REPETITION_DAYS):
          group_probability = (
            ((user_id % len(CohortTest.EMAIL_WEEKS)) + 1.0) /
            len(CohortTest.EMAIL_WEEKS))
          day_probability = 1.0 / (day + 1)
          action_probability = group_probability * day_probability
          if random() < action_probability:
            action_date = email_dates['precise'] + timedelta(days=day)
            # Record the expected count, bucketed by the cohort week and
            # the precise action date.
            self._expected[email_dates['cohort']]['action_dates'][
              datetime_to_date_str(action_date)] += 1
            client.put({CohortTest.FRONTPAGE_STREAM: [{'user_id': user_id,
                                            '@time': action_date}]})

    print json.dumps(self._expected, sort_keys=True, indent=2)
Esempio n. 9
0
  def test_lex_sort(self):
    """
    Ensure the UUID segment flip enables correct lexicographic sorting
    of the v1 time UUIDs used.

    The timespan covered is 230 years, so the high bits of the time
    UUIDs are forced to differ.
    """
    total_seconds = 230 * 365 * 24 * 60 * 60

    uuids = []
    for offset in range(0, total_seconds, 7250000):
      moment = datetime.datetime.now() + datetime.timedelta(seconds=offset)
      kronos_time = datetime_to_kronos_time(moment)
      # Two UUIDs at the same timestamp exercise tie-breaking.
      pair = sorted([uuid_from_kronos_time(kronos_time),
                     uuid_from_kronos_time(kronos_time)])
      uuids.extend(pair)

    uuids = [str(value) for value in uuids]
    # Flip, sort lexicographically, then flip back; the result must match
    # the original (chronologically ordered) sequence.
    flipped = sorted(sortable_time_uuid_str(value) for value in uuids)
    round_tripped = [flip_uuid_parts(value) for value in flipped]

    self.assertEqual(uuids, round_tripped)
Esempio n. 10
0
    def test_lex_sort(self):
        """
    This test ensures that the UUID segment flip enables correct lexicographic
    sorting of the v1 time UUIDs used.

    The timespan tested is 230 years so that the high bits in the time UUID must
    differ.
    """
        # 230 years expressed in seconds; sampled every ~84 days below.
        seconds = 230 * 365 * 24 * 60 * 60

        uuids = []
        for idx, sec in enumerate(range(0, seconds, 7250000)):
            dt = datetime.datetime.now() + datetime.timedelta(seconds=sec)
            kt = datetime_to_kronos_time(dt)
            # Two UUIDs at the same kronos time exercise tie-breaking.
            event1 = uuid_from_kronos_time(kt)
            event2 = uuid_from_kronos_time(kt)
            events = sorted([event1, event2])
            uuids.append(events[0])
            uuids.append(events[1])

        uuids = [str(uuid) for uuid in uuids]
        # Flip the segments, sort lexicographically, flip back: the result
        # must equal the original chronologically ordered list.
        flipped_uuids = [sortable_time_uuid_str(uuid) for uuid in uuids]
        flipped_uuids = sorted(flipped_uuids)
        flipped_uuids = [flip_uuid_parts(uuid) for uuid in flipped_uuids]

        self.assertEqual(uuids, flipped_uuids)
Esempio n. 11
0
  def put(self, event_dict, namespace=None):
    """
    Sends a dictionary of `event_dict` of the form {stream_name:
    [event, ...], ...}  to the server.
    """
    # Copy the input, in case we need to modify it by adding a timestamp.
    event_dict = copy.deepcopy(event_dict)

    # Ensure that all events have a timestamp.
    timestamp = kronos_time_now()
    for events in event_dict.itervalues():
      for event in events:
        if TIMESTAMP_FIELD not in event:
          event[TIMESTAMP_FIELD] = timestamp
        else:
          # Normalize string -> datetime -> kronos-time timestamps.
          if isinstance(event[TIMESTAMP_FIELD], types.StringTypes):
            event[TIMESTAMP_FIELD] = parse(event[TIMESTAMP_FIELD])
          if isinstance(event[TIMESTAMP_FIELD], datetime):
            event[TIMESTAMP_FIELD] = datetime_to_kronos_time(
              event[TIMESTAMP_FIELD])
        # Tag each event with the client library and version that sent it.
        event[LIBRARY_FIELD] = {
          'version': pykronos.__version__,
          'name': 'pykronos'
        }

    namespace = namespace or self.namespace

    if self._blocking:
      # Blocking mode: send immediately and return the server's response.
      return self._put(namespace, event_dict)
    else:
      # Non-blocking mode: enqueue under the lock for a later flush.
      with self._put_lock:
        self._put_queue.append((namespace, event_dict))
Esempio n. 12
0
    def put(self, event_dict, namespace=None):
        """
    Sends a dictionary of `event_dict` of the form {stream_name:
    [event, ...], ...}  to the server.
    """
        # Copy the input, in case we need to modify it by adding a timestamp.
        event_dict = copy.deepcopy(event_dict)

        # Ensure that all events have a timestamp.
        timestamp = kronos_time_now()
        for events in event_dict.itervalues():
            for event in events:
                if TIMESTAMP_FIELD not in event:
                    event[TIMESTAMP_FIELD] = timestamp
                else:
                    # Normalize string -> datetime -> kronos-time stamps.
                    if isinstance(event[TIMESTAMP_FIELD], types.StringTypes):
                        event[TIMESTAMP_FIELD] = parse(event[TIMESTAMP_FIELD])
                    if isinstance(event[TIMESTAMP_FIELD], datetime):
                        event[TIMESTAMP_FIELD] = datetime_to_kronos_time(
                            event[TIMESTAMP_FIELD])
                # Tag each event with the sending library and its version.
                event[LIBRARY_FIELD] = {
                    'version': pykronos.__version__,
                    'name': 'pykronos'
                }

        namespace = namespace or self.namespace

        if self._blocking:
            # Blocking mode: send now and return the server's response.
            return self._put(namespace, event_dict)
        else:
            # Non-blocking mode: enqueue under the lock for a later flush.
            with self._put_lock:
                self._put_queue.append((namespace, event_dict))
Esempio n. 13
0
 def get_stream5(self):
   """Build ten synthetic events with even userIds, one minute apart."""
   event_time = datetime.datetime(2014, 3, 21)
   one_minute = datetime.timedelta(minutes=1)
   events = []
   # NOTE(review): user_id is always even here, so 'type' is always 'b';
   # expression kept verbatim to preserve behavior.
   for user_id in range(0, 20, 2):
     events.append({'userId': str(user_id),
                    'type': 'a' if user_id % 2 else 'b',
                    '@time': datetime_to_kronos_time(event_time)})
     event_time += one_minute
   return events
Esempio n. 14
0
 def get_stream4(self):
   """Build two 'a'-type events keyed by username, two minutes apart."""
   base = datetime.datetime(2014, 3, 20)
   fuzz = datetime.timedelta(minutes=2)
   # User '0' acts just before the window start; user '1' exactly at it.
   schedule = [('0', base - fuzz), ('1', base)]
   return [{'username': user_id,
            'type': 'a',
            '@time': datetime_to_kronos_time(when)}
           for user_id, when in schedule]
Esempio n. 15
0
 def get_stream5(self):
     """Build ten synthetic events with even userIds, one minute apart."""
     start = datetime.datetime(2014, 3, 21)
     delta = datetime.timedelta(minutes=1)
     events = []
     # NOTE(review): i is always even here, so 'type' is always 'b' —
     # confirm whether the 'a' branch was ever intended to fire.
     for i in range(0, 20, 2):
         events.append({
             'userId': str(i),
             'type': 'a' if i % 2 else 'b',
             '@time': datetime_to_kronos_time(start)
         })
         start += delta
     return events
Esempio n. 16
0
  def _get_timeframe_bounds(self, timeframe, bucket_width):
    """
    Get a `bucket_width` aligned `start_time` and `end_time` from a
    `timeframe` dict.

    `timeframe['mode']` selects the interpretation: 'recent' measures a
    duration back from now; 'range' uses explicit 'from'/'to' strings.
    `bucket_width` is in seconds (falsy disables alignment).  Returns a
    (start, end) pair of datetimes; raises ValueError on unknown mode.
    """
    if bucket_width:
      # Keep the seconds value for 'range' math; convert a copy to
      # kronos-time units for 'recent' math.
      bucket_width_seconds = bucket_width
      bucket_width = epoch_time_to_kronos_time(bucket_width)

    # TODO(derek): Potential optimization by setting the end_time equal to the
    # untrusted_time if end_time > untrusted_time and the results are not being
    # output to the user (only for caching)
    if timeframe['mode'] == 'recent':
      # Set end_time equal to now and align to bucket width
      end_time = datetime_to_kronos_time(datetime.datetime.now())
      original_end_time = end_time
      duration = get_seconds(timeframe['value'], timeframe['scale'])
      duration = epoch_time_to_kronos_time(duration)
      start_time = original_end_time - duration

      if bucket_width:
        # Align values to the bucket width
        # TODO(derek): Warn the user that the timeframe has been altered to fit
        # the bucket width
        # Round the end up and the start down, so the aligned window
        # always contains the requested one.
        if (end_time % bucket_width) != 0:
          end_time += bucket_width - (end_time % bucket_width)

        if (start_time % bucket_width) != 0:
          start_time -= (start_time % bucket_width)

      start = kronos_time_to_datetime(start_time)
      end = kronos_time_to_datetime(end_time)
    elif timeframe['mode'] == 'range':
      end = datetime.datetime.strptime(timeframe['to'], DT_FORMAT)
      end_seconds = datetime_to_epoch_time(end)

      start = datetime.datetime.strptime(timeframe['from'], DT_FORMAT)
      start_seconds = datetime_to_epoch_time(start)

      if bucket_width:
        # Align values to the bucket width
        # TODO(derek): Warn the user that the timeframe has been altered to fit
        # the bucket width
        # Same widening as above, but computed in epoch seconds.
        start_bump = start_seconds % bucket_width_seconds
        start -= datetime.timedelta(seconds=start_bump)
        if (end_seconds % bucket_width_seconds) != 0:
          end_bump = bucket_width_seconds - (end_seconds % bucket_width_seconds)
          end += datetime.timedelta(seconds=end_bump)
    else:
      raise ValueError("Timeframe mode must be 'recent' or 'range'")

    return start, end
Esempio n. 17
0
  def delete(self, stream, start_time, end_time, start_id=None, namespace=None):
    """
    Delete events in the stream with name `stream` that occurred between
    `start_time` and `end_time`.  An optional `start_id` allows the
    client to delete events starting from an ID rather than a timestamp.
    Raises KronosClientError on a non-200 response or a server-reported
    failure.
    """
    # Normalize string -> datetime -> kronos-time for both bounds.
    if isinstance(start_time, types.StringTypes):
      start_time = parse(start_time)
    if isinstance(end_time, types.StringTypes):
      end_time = parse(end_time)
    if isinstance(start_time, datetime):
      start_time = datetime_to_kronos_time(start_time)
    if isinstance(end_time, datetime):
      end_time = datetime_to_kronos_time(end_time)
    request_dict = {
      'stream': stream,
      'end_time': end_time
    }
    # Test against None (not truthiness) so a falsy-but-valid ID is still
    # honored; this matches the `is not None` check used by `get`.
    if start_id is not None:
      request_dict['start_id'] = start_id
    else:
      request_dict['start_time'] = start_time

    namespace = namespace or self.namespace
    if namespace is not None:
      request_dict['namespace'] = namespace

    response = requests.post(self._delete_url,
                             data=json.dumps(request_dict),
                             stream=True)
    if response.status_code != requests.codes.ok:
      raise KronosClientError('Bad server response code %d' %
                              response.status_code)
    response_dict = response.json()
    if not response_dict[SUCCESS_FIELD]:
      raise KronosClientError('Encountered errors %s' %
                              _get_errors(response_dict))
    return response_dict
Esempio n. 18
0
 def get_stream4(self):
     """Build two 'a'-type events keyed by username, two minutes apart."""
     user_ids = ['0', '1']
     fuzzy_time = datetime.timedelta(minutes=2)
     # User '0' acts just before the window start; user '1' exactly at it.
     times = [
         datetime.datetime(2014, 3, 20) - fuzzy_time,
         datetime.datetime(2014, 3, 20)
     ]
     events = []
     for user_id, time in zip(user_ids, times):
         events.append({
             'username': user_id,
             'type': 'a',
             '@time': datetime_to_kronos_time(time)
         })
     return events
Esempio n. 19
0
  def verify_results(self, result_func, cache, expected_results,
                     expected_computations):
    """Run `result_func` and check both its output and the cache's work.

    Asserts that exactly `expected_computations` bucket computations
    occurred and that `expected_results` results came back with the
    expected timestamps and 'b_sum' values.
    """
    # Wrap (not replace) _compute_bucket so real computation still runs
    # while we count how many buckets were actually computed.
    with patch.object(cache, '_compute_bucket',
                      wraps=cache._compute_bucket) as mock_method:
      results = result_func()
      self.assertEqual(mock_method.call_count, expected_computations)

    self.assertEqual(len(results), expected_results)
    # Results must advance by one bucket width per entry, starting at
    # self.start_time.
    result_time = self.start_time
    for idx, result in enumerate(results):
      self.assertEqual(result[TIMESTAMP_FIELD],
                       datetime_to_kronos_time(result_time))
      self.assertEqual(result['b_sum'], sum([2, 7, 12, 17] + [idx * 4 * (
              self.bucket_width.total_seconds() / 60)]))
      result_time += self.bucket_width
Esempio n. 20
0
    def put(self, event_dict, namespace=None):
        """
    Sends a dictionary of `event_dict` of the form {stream_name:
    [event, ...], ...}  to the server.

    The `blocking` parameter allows the request to block until the
    server responds, and returns some information on the response.
    Here's an example:

    {u'stream_name_1': 3, u'stream_name_2': 1, u'@took': u'1ms'}
      -> put 3 events on stream_name_1
      -> put 1 event on stream_name_2
      -> put took 1ms to complete

    If `blocking` is false and the process running the client ends
    before flushing the pending data to the server, you might lose
    that data.  Calling `flush` will block until all pending data has
    been acknowledged by the server.
    """
        # Copy the input, in case we need to modify it by adding a timestamp.
        event_dict = copy.deepcopy(event_dict)

        # Ensure that all events have a timestamp.
        timestamp = kronos_time_now()
        for events in event_dict.itervalues():
            for event in events:
                if TIMESTAMP_FIELD not in event:
                    event[TIMESTAMP_FIELD] = timestamp
                else:
                    # Normalize string -> datetime -> kronos-time stamps.
                    if isinstance(event[TIMESTAMP_FIELD], types.StringTypes):
                        event[TIMESTAMP_FIELD] = parse(event[TIMESTAMP_FIELD])
                    if isinstance(event[TIMESTAMP_FIELD], datetime):
                        event[TIMESTAMP_FIELD] = datetime_to_kronos_time(
                            event[TIMESTAMP_FIELD])
                # Tag each event with the sending library and its version.
                event[LIBRARY_FIELD] = {
                    'version': pykronos.__version__,
                    'name': 'pykronos'
                }

        namespace = namespace or self.namespace

        if self._blocking:
            # Blocking mode: send now and return the server's response.
            return self._put(namespace, event_dict)
        else:
            # Non-blocking mode: enqueue under the lock for a later flush.
            with self._put_lock:
                self._put_queue.append((namespace, event_dict))
Esempio n. 21
0
  def put(self, event_dict, namespace=None):
    """
    Send `event_dict`, a mapping of the form {stream_name:
    [event, ...], ...}, to the server.

    When the client is blocking, the call waits for the server and
    returns response details, for example:

    {u'stream_name_1': 3, u'stream_name_2': 1, u'@took': u'1ms'}
      -> put 3 events on stream_name_1
      -> put 1 event on stream_name_2
      -> put took 1ms to complete

    When non-blocking, events are queued locally; data not yet flushed
    when the process exits may be lost.  `flush` blocks until all
    pending data has been acknowledged by the server.
    """
    # Deep-copy so callers never observe the fields injected below.
    event_dict = copy.deepcopy(event_dict)

    # Every event needs a kronos timestamp; default to "now".
    now = kronos_time_now()
    for stream_events in event_dict.itervalues():
      for event in stream_events:
        if TIMESTAMP_FIELD in event:
          # Normalize string -> datetime -> kronos-time representations.
          stamp = event[TIMESTAMP_FIELD]
          if isinstance(stamp, types.StringTypes):
            stamp = parse(stamp)
          if isinstance(stamp, datetime):
            stamp = datetime_to_kronos_time(stamp)
          event[TIMESTAMP_FIELD] = stamp
        else:
          event[TIMESTAMP_FIELD] = now
        # Tag each event with the sending library and its version.
        event[LIBRARY_FIELD] = {
          'name': 'pykronos',
          'version': pykronos.__version__
        }

    namespace = namespace or self.namespace

    if not self._blocking:
      # Queue for a later flush; mutate the queue only under the lock.
      with self._put_lock:
        self._put_queue.append((namespace, event_dict))
      return
    return self._put(namespace, event_dict)
Esempio n. 22
0
  def test_user_id_mapping_missing(self, mock_logging):
    """Verify the error logged when an event lacks the user-id field."""
    # The mocked client serves stream1 then stream4; stream4 events use
    # 'username' rather than the 'userId' field the funnel expects.
    client = Mock()
    client.get = Mock(side_effect=[self.get_stream1(),
                                   self.get_stream4()])

    start = datetime.datetime(2014,3,20)
    end = datetime.datetime(2014,3,21)
    step1 = FunnelStep('stream1')
    step2 = FunnelStep('stream2')
    # NOTE(review): the test pins the current failure mode — an
    # UnboundLocalError inside funnel_analyze — rather than a clean error.
    with self.assertRaisesRegexp(UnboundLocalError,
                                 ("local variable 'user' referenced before "
                                  "assignment")):
        funnel_analyze(client, [step1, step2],
                       start, end, end, {}, None)
    # The missing-field event should have been logged before the crash.
    mock_logging.error.assert_called_with(
        'Unable to get field %s on %s from %s', 'userId',
        'stream2',
        {'username': '******',
         'type': 'a',
         '@time': datetime_to_kronos_time(datetime.datetime(2014,3,19,23,58))})
Esempio n. 23
0
    def test_user_id_mapping_missing(self, mock_logging):
        """Verify the error logged when an event lacks the user-id field."""
        # The mocked client serves stream1 then stream4; stream4 events
        # use 'username' rather than the expected 'userId' field.
        client = Mock()
        client.get = Mock(side_effect=[self.get_stream1(), self.get_stream4()])

        start = datetime.datetime(2014, 3, 20)
        end = datetime.datetime(2014, 3, 21)
        step1 = FunnelStep('stream1')
        step2 = FunnelStep('stream2')
        # NOTE(review): pins the current failure mode (UnboundLocalError
        # inside funnel_analyze) rather than a clean error path.
        with self.assertRaisesRegexp(
                UnboundLocalError, ("local variable 'user' referenced before "
                                    "assignment")):
            funnel_analyze(client, [step1, step2], start, end, end, {}, None)
        # The missing-field event should have been logged before the crash.
        mock_logging.error.assert_called_with(
            'Unable to get field %s on %s from %s', 'userId', 'stream2', {
                'username':
                '******',
                'type':
                'a',
                '@time':
                datetime_to_kronos_time(datetime.datetime(2014, 3, 19, 23, 58))
            })
Esempio n. 24
0
      {'source': 'http://test.com',
       'browser': {'name': 'Firefox', 'version': 26},
       'pages': ['page1.html', 'page2.html']}],
   'yourproduct.website.clicks': [
      {'user': 40, 'num_clicks': 7},
      {'user': 42, 'num_clicks': 2}]
   })

"""
### Optionally Add A Timestamp

By default, each event will be timestamped on the client.  If you add
a `TIMESTAMP_FIELD` argument, you can specify the time at which each
event occurred.
"""
optional_time = datetime_to_kronos_time(start + timedelta(seconds=5))
kc.put({'yourproduct.website.clicks': [
  {'user': 35, 'num_clicks': 10, TIMESTAMP_FIELD: optional_time}]})


"""
## Retrieving Events

Retrieving events requires a stream name, a start datetime, and an end
datetime.  Note that an `ID_FIELD` and `@TIMESTAMP_FIELD` field are
attached to each event.  The `ID_FIELD` is a UUID1-style identifier
with its time bits derived from the timestamp.  This allows event IDs
to be roughly sortable by the time that they happened while providing
a deterministic tiebreaker when two events happened at the same time.
"""
events = kc.get('yourproduct.website.clicks',
Esempio n. 25
0
    def get(self,
            stream,
            start_time,
            end_time,
            start_id=None,
            limit=None,
            order=ResultOrder.ASCENDING,
            namespace=None,
            timeout=None):
        """
    Queries a stream with name `stream` for all events between `start_time` and
    `end_time` (both inclusive).  An optional `start_id` allows the client to
    restart from a failure, specifying the last ID they read; all events that
    happened after that ID will be returned. An optional `limit` limits the
    maximum number of events returned.  An optional `order` requests results in
    `ASCENDING` or `DESCENDING` order.

    Yields events one at a time as decoded dicts.  Transient request
    failures are retried up to 10 times, resuming after the last event
    ID already yielded; a timeout raises KronosClientError immediately.
    """
        # Normalize string -> datetime -> kronos-time for both bounds.
        if isinstance(start_time, types.StringTypes):
            start_time = parse(start_time)
        if isinstance(end_time, types.StringTypes):
            end_time = parse(end_time)
        if isinstance(start_time, datetime):
            start_time = datetime_to_kronos_time(start_time)
        if isinstance(end_time, datetime):
            end_time = datetime_to_kronos_time(end_time)

        request_dict = {
            'stream': stream,
            'end_time': end_time,
            'order': order,
        }
        # An explicit start_id supersedes start_time.
        if start_id is not None:
            request_dict['start_id'] = start_id
        else:
            request_dict['start_time'] = start_time

        if limit is not None:
            request_dict['limit'] = limit

        namespace = namespace or self.namespace
        if namespace is not None:
            request_dict['namespace'] = namespace

        errors = []
        last_id = None
        while True:
            try:
                response = self._make_request(self._get_url,
                                              data=request_dict,
                                              stream=True,
                                              timeout=timeout)
                for line in response.iter_lines(chunk_size=self._chunk_size):
                    if line:
                        # Python's json adds a lot of overhead when decoding a large
                        # number of events; ujson fares better. However ujson won't work
                        # on PyPy since it's a C extension.
                        event = ujson.loads(line, precise_float=True)
                        # Track the last ID seen so a retry can resume
                        # after it instead of re-reading from the start.
                        last_id = event[ID_FIELD]
                        yield event
                break
            except Exception, e:
                if isinstance(e, requests.exceptions.Timeout):
                    raise KronosClientError('Request timed out.')
                errors.append(e)
                # Give up after 10 consecutive failed attempts.
                if len(errors) == 10:
                    raise KronosClientError(errors)
                if last_id is not None:
                    request_dict.pop('start_time', None)
                    request_dict['start_id'] = last_id
                # Linear backoff: 0.1s per accumulated error.
                time.sleep(len(errors) * 0.1)
Esempio n. 26
0
"""
kc.put({'yourproduct.website.pageviews': [
         {'source': 'http://test.com',
          'browser': {'name': 'Firefox', 'version': 26},
          'pages': ['page1.html', 'page2.html']}],
        'yourproduct.website.clicks': [
         {'user': 40, 'num_clicks': 7},          
         {'user': 42, 'num_clicks': 2}]})

## Optionally add a timestamp
"""
By default, each event will be timestamped on the client.  If you add
a `TIMESTAMP_FIELD` argument, you can specify the time at which each
event occurred.
"""
optional_time = datetime_to_kronos_time(start + timedelta(seconds=5))
kc.put({'yourproduct.website.clicks': [
  {'user': 35, 'num_clicks': 10, TIMESTAMP_FIELD: optional_time}]})


# Retrieving data
"""
Retrieving data requires a stream name, a start datetime, and an end
datetime.  Note that an `ID_FIELD` and `@TIMESTAMP_FIELD` field are
attached to each event.  The `ID_FIELD` is a UUID1-style identifier
with its time bits derived from the timestamp.  This allows event IDs
to be roughly sortable by the time that they happened while providing
a deterministic tiebreaker when two events happened at the same time.
"""
events = kc.get('yourproduct.website.clicks',
                start,
Esempio n. 27
0
import logging
from datetime import timedelta

from pykronos.common.event_tools import get_property
from pykronos.client import TIMESTAMP_FIELD
from pykronos.common.time import datetime_to_kronos_time
from pykronos.common.time import timedelta_to_kronos_time
from pykronos.common.time import EPOCH

# Module-level logger, named after this module per logging convention.
log = logging.getLogger(__name__)

# Kronos timestamp of the epoch: the earliest representable action time.
# FilterCache.get returns this sentinel for keys that pass the filter.
EARLIEST_TIME = datetime_to_kronos_time(EPOCH)


class FilterCache(object):
    """
    Memoizes the results of a user-supplied filter function.

    `a_filter` is a callable taking a key and returning a (typically
    boolean) verdict; it is invoked at most once per distinct key, and the
    result is cached for subsequent lookups.  Passing `a_filter=None`
    accepts every key.
    """
    def __init__(self, a_filter):
        self._filter = a_filter
        self._filter_results = {}

    def _caching_filter(self, key):
        """Return the cached filter verdict for `key` (True if no filter)."""
        if not self._filter:
            return True
        # Membership test rather than a `result is None` sentinel: a filter
        # that legitimately returns None (or any falsy value) must still be
        # invoked only once per key, otherwise the memoization is defeated.
        if key not in self._filter_results:
            self._filter_results[key] = self._filter(key)
        return self._filter_results[key]

    def get(self, key):
        """Return EARLIEST_TIME if `key` passes the filter, else None."""
        return EARLIEST_TIME if self._caching_filter(key) else None
Esempio n. 28
0
  def get(self, stream, start_time, end_time, start_id=None, limit=None,
          order=ResultOrder.ASCENDING, namespace=None, timeout=None):
    """
    Queries a stream with name `stream` for all events between `start_time` and
    `end_time` (both inclusive).  An optional `start_id` allows the client to
    restart from a failure, specifying the last ID they read; all events that
    happened after that ID will be returned. An optional `limit` limits the
    maximum number of events returned.  An optional `order` requests results in
    `ASCENDING` or `DESCENDING` order.

    `start_time` and `end_time` may each be a parseable date string, a
    `datetime`, or an already-converted Kronos timestamp.  `namespace`
    overrides the client's default namespace; `timeout` is forwarded to the
    HTTP request.  This is a generator: events are yielded one at a time as
    the streamed response is decoded.
    """
    # Normalize: string -> datetime -> Kronos time. Raw numeric Kronos
    # timestamps fall through both isinstance checks unchanged.
    if isinstance(start_time, types.StringTypes):
      start_time = parse(start_time)
    if isinstance(end_time, types.StringTypes):
      end_time = parse(end_time)
    if isinstance(start_time, datetime):
      start_time = datetime_to_kronos_time(start_time)
    if isinstance(end_time, datetime):
      end_time = datetime_to_kronos_time(end_time)

    request_dict = {
      'stream': stream,
      'end_time': end_time,
      'order': order,
    }
    # `start_id` and `start_time` are mutually exclusive in the request;
    # an explicit start_id takes precedence.
    if start_id is not None:
      request_dict['start_id'] = start_id
    else:
      request_dict['start_time'] = start_time

    if limit is not None:
      request_dict['limit'] = limit

    namespace = namespace or self.namespace
    if namespace is not None:
      request_dict['namespace'] = namespace

    # Retry loop: up to 10 failed attempts with linearly increasing sleep.
    # `last_id` tracks the most recently *yielded* event so a retry can
    # resume just after it instead of re-fetching (and re-yielding)
    # everything from `start_time`.
    errors = []
    last_id = None
    while True:
      try:
        response = self._make_request(self._get_url,
                                      data=request_dict,
                                      stream=True,
                                      timeout=timeout)
        # Server streams one JSON-encoded event per line.
        for line in response.iter_lines(chunk_size=self._chunk_size):
          if line:
            # Python's json adds a lot of overhead when decoding a large
            # number of events; ujson fares better. However ujson won't work
            # on PyPy since it's a C extension.
            event = ujson.loads(line, precise_float=True)
            last_id = event[ID_FIELD]
            yield event
        break
      except Exception, e:
        # Timeouts are surfaced immediately rather than retried.
        if isinstance(e, requests.exceptions.Timeout):
          raise KronosClientError('Request timed out.')
        errors.append(e)
        if len(errors) == 10:
          raise KronosClientError(errors)
        if last_id is not None:
          # Resume after the last event we already yielded; start_time must
          # be dropped since the two parameters are mutually exclusive.
          # NOTE(review): resuming via start_id presumably assumes
          # ascending ID order server-side — confirm behavior when
          # `order` is DESCENDING.
          request_dict.pop('start_time', None)
          request_dict['start_id'] = last_id
        time.sleep(len(errors) * 0.1)