Example #1
def _cohort_stream_transform(source, stream, start, end,
                             transform, grouping_key, unit):
  start_stream = KronosSource(source, stream, start, end)
  if transform:
    transformed = transform(start_stream)
  else:
    transformed = start_stream
  projected = Project(transformed,
                      [Property(TIMESTAMP_FIELD,
                                alias=TIMESTAMP_FIELD),
                       Property(grouping_key,
                                alias=grouping_key),
                       Floor([Property(TIMESTAMP_FIELD),
                              Constant(DateUnit.unit_to_kronos_time(unit)),
                              Constant(start)],
                             alias='date')])
  # This leaves us with a single event per (user, unit time) pair.
  aggregated = Aggregate(
    projected,
    GroupBy([Property(grouping_key,
                      alias=grouping_key),
             Property('date',
                      alias='date')]),
    # The first time the user performed the event in that bucket.
    [Min([Property(TIMESTAMP_FIELD)], alias=TIMESTAMP_FIELD)])
  return aggregated
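Read end to end, Example #1 projects each event down to its timestamp, its grouping key, and a floored 'date' bucket, then keeps the earliest event per (user, bucket) pair. The sketch below restates those semantics in plain Python, assuming Floor's third argument is the base offset for the buckets; the function and parameter names are hypothetical, not metis APIs.

def first_event_per_bucket(events, timestamp_field, grouping_key,
                           unit_width, start):
  # Plain-Python sketch of the transform above (no metis dependency).
  # unit_width is the bucket size in Kronos time units; start is the
  # base the buckets are offset from.
  earliest = {}
  for event in events:
    ts = event[timestamp_field]
    # Floor the timestamp to a bucket boundary, offset from start.
    bucket = start + ((ts - start) // unit_width) * unit_width
    key = (event[grouping_key], bucket)
    earliest[key] = min(earliest.get(key, ts), ts)
  # One event per (user, bucket): the first time the user acted in it.
  return [{timestamp_field: ts, grouping_key: user, 'date': bucket}
          for (user, bucket), ts in earliest.items()]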
Example #2
    def test_aggregate(self):
        sums = defaultdict(int)
        for i in xrange(200):
            a = random.randint(0, 2)
            self.kronos_client.put(
                {self.stream: [{
                    constants.TIMESTAMP_FIELD: i,
                    'a': a
                }]})
            sums[50 * (i / 50)] += a  # bucket start for i (Py2 floor division)
        events = self.query(
            Aggregate(
                Project(KronosSource('kronos', self.stream, 0, 1000),
                        [Floor([Property(constants.TIMESTAMP_FIELD),
                                Constant(50)],
                               alias=constants.TIMESTAMP_FIELD)],
                        merge=True),
                GroupBy(Property(constants.TIMESTAMP_FIELD,
                                 alias=constants.TIMESTAMP_FIELD)),
                [Count([], alias='count'),
                 Sum([Property('a')], alias='sum'),
                 Min([Property('a')], alias='min'),
                 Max([Property('a')], alias='max'),
                 Avg([Property('a')], alias='avg')]).to_dict())
        self.assertEqual(len(events), 200 / 50)
        for event in events:
            self.assertEqual(event[constants.TIMESTAMP_FIELD] % 50, 0)
            self.assertEqual(event['count'], 50)
            self.assertEqual(event['min'], 0)
            self.assertEqual(event['max'], 2)
            self.assertEqual(event['sum'],
                             sums[event[constants.TIMESTAMP_FIELD]])
            self.assertTrue(event['avg'] * 50 > event['sum'] - 0.1)
            self.assertTrue(event['avg'] * 50 < event['sum'] + 0.1)

        events = self.query(
            Aggregate(
                KronosSource('kronos', self.stream, 0, 1000),
                GroupBy(
                    Floor([Property(constants.TIMESTAMP_FIELD),
                           Constant(50)],
                          alias=constants.TIMESTAMP_FIELD)),
                [Count([], alias='count')]).to_dict())
        self.assertEqual(len(events), 200 / 50)
Example #3
def aggregate(query_plan, operands):
  aggregates = []

  for agg in operands['aggregates']:
    aggregates.append(agg_op(agg['agg_type'], cpf(agg['agg_on']),
                             agg['alias']))

  groups = []
  for group in operands['groups']:
    groups.append(cpf(group['field'], group['alias']))

  group_by = GroupBy(groups)
  return Aggregate(query_plan, group_by, aggregates)
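The shape of operands can be read off the lookups above. Below is a hypothetical payload: the key names are the ones the helper reads, the CPF dicts follow the keys visible in Example #6, and all values are invented for illustration.

operands = {
    'aggregates': [{'agg_type': 'sum',
                    'agg_on': {'cpf_type': 'property', 'property_name': 'a'},
                    'alias': 'total'}],
    'groups': [{'field': {'cpf_type': 'property', 'property_name': 'user'},
                'alias': 'user'}],
}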
Example #4
  def test_parsing_and_to_dict(self):
    _property = {'type': 'property', 'name': 'x'}
    constant = {'type': 'constant', 'value': 1}
    function = {'type': 'function', 'name': 'add',
                'arguments': [deepcopy(_property), deepcopy(constant)]}
    self.assertEqual(Value.parse(deepcopy(constant)).to_dict(), constant)
    self.assertEqual(Value.parse(deepcopy(_property)).to_dict(), _property)
    self.assertEqual(Value.parse(deepcopy(function)).to_dict(), function)

    kronos = {'type': 'kronos', 'host': 'localhost', 'stream': 'mystream',
              'start_time': 100, 'end_time': 200}
    self.assertEqual(Stream.parse(deepcopy(kronos)).to_dict(), kronos)

    condition_lt = {'op': 'lt', 'left': deepcopy(_property),
                    'right': deepcopy(constant)}
    condition_eq = {'op': 'eq', 'left': deepcopy(function),
                    'right': deepcopy(_property)}
    condition_or = {'type': 'or', 'conditions': [deepcopy(condition_lt),
                                                 deepcopy(condition_eq)]}
    self.assertEqual(Condition.parse(deepcopy(condition_lt)).to_dict(),
                     condition_lt)
    self.assertEqual(Condition.parse(deepcopy(condition_eq)).to_dict(),
                     condition_eq)
    self.assertEqual(Condition.parse(deepcopy(condition_or)).to_dict(),
                     condition_or)

    avg = {'op': 'avg', 'arguments': [deepcopy(_property)], 'alias': 'myavg'}
    count = {'op': 'count', 'alias': 'mycount'}
    self.assertEqual(Aggregator.parse(deepcopy(avg)).to_dict(), avg)
    self.assertEqual(Aggregator.parse(deepcopy(count)).to_dict(), count)

    group_by = deepcopy(function)
    group_by['alias'] = 'mygroup'
    group_by = [group_by]
    self.assertEqual(GroupBy.parse(deepcopy(group_by)).to_dict(), group_by)

    project = {'type': 'project', 'fields': [deepcopy(_property)],
               'stream': deepcopy(kronos)}
    _filter = {'type': 'filter', 'condition': condition_lt,
               'stream': deepcopy(project)}
    aggregate = {'type': 'aggregate',
                 'group_by': deepcopy(group_by),
                 'aggregates': [deepcopy(avg), deepcopy(count)],
                 'stream': deepcopy(_filter)}
    join = {'type': 'join', 'left': deepcopy(aggregate),
            'right': deepcopy(project), 'condition': deepcopy(condition_or)}
    self.assertEqual(Transform.parse(deepcopy(project)).to_dict(), project)
    self.assertEqual(Transform.parse(deepcopy(_filter)).to_dict(), _filter)
    self.assertEqual(Transform.parse(deepcopy(aggregate)).to_dict(), aggregate)
    self.assertEqual(Transform.parse(deepcopy(join)).to_dict(), join)
Example #5
  def test_parsing_and_to_dict(self):
    _property = {'type': 'property', 'name': 'x'}
    constant = {'type': 'constant', 'value': 1}
    function = {'type': 'function', 'name': 'add',
                'arguments': [deepcopy(_property), deepcopy(constant)]}
    self.assertEqual(Value.parse(deepcopy(constant)).to_dict(), constant)
    self.assertEqual(Value.parse(deepcopy(_property)).to_dict(), _property)
    self.assertEqual(Value.parse(deepcopy(function)).to_dict(), function)

    kronos = {'type': 'data_access', 'source': 'kronos', 'stream': 'mystream',
              'start_time': 100, 'end_time': 200}
    self.assertEqual(Operator.parse(deepcopy(kronos)).to_dict(), kronos)

    condition_lt = {'op': 'lt', 'left': deepcopy(_property),
                    'right': deepcopy(constant)}
    condition_eq = {'op': 'eq', 'left': deepcopy(function),
                    'right': deepcopy(_property)}
    condition_or = {'type': 'or', 'conditions': [deepcopy(condition_lt),
                                                 deepcopy(condition_eq)]}
    self.assertEqual(Condition.parse(deepcopy(condition_lt)).to_dict(),
                     condition_lt)
    self.assertEqual(Condition.parse(deepcopy(condition_eq)).to_dict(),
                     condition_eq)
    self.assertEqual(Condition.parse(deepcopy(condition_or)).to_dict(),
                     condition_or)

    avg = {'op': 'avg', 'arguments': [deepcopy(_property)], 'alias': 'myavg'}
    count = {'op': 'count', 'alias': 'mycount'}
    self.assertEqual(Aggregator.parse(deepcopy(avg)).to_dict(), avg)
    self.assertEqual(Aggregator.parse(deepcopy(count)).to_dict(), count)

    group_by = deepcopy(function)
    group_by['alias'] = 'mygroup'
    group_by = [group_by]
    self.assertEqual(GroupBy.parse(deepcopy(group_by)).to_dict(), group_by)

    project = {'type': 'project', 'fields': [deepcopy(_property)],
               'source': deepcopy(kronos)}
    _filter = {'type': 'filter', 'condition': condition_lt,
               'source': deepcopy(project)}
    aggregate = {'type': 'aggregate',
                 'group_by': deepcopy(group_by),
                 'aggregates': [deepcopy(avg), deepcopy(count)],
                 'source': deepcopy(_filter)}
    join = {'type': 'join', 'left': deepcopy(aggregate),
            'right': deepcopy(project), 'condition': deepcopy(condition_or)}
    self.assertEqual(Operator.parse(deepcopy(project)).to_dict(), project)
    self.assertEqual(Operator.parse(deepcopy(_filter)).to_dict(), _filter)
    self.assertEqual(Operator.parse(deepcopy(aggregate)).to_dict(), aggregate)
    self.assertEqual(Operator.parse(deepcopy(join)).to_dict(), join)
Example #6
def aggregate(query_plan, operands):
    aggregates = []

    for agg in operands['aggregates']:
        cpf_type = agg['agg_on']['cpf_type']
        property_name = agg['agg_on'].get('property_name')
        constant_value = agg['agg_on'].get('constant_value')
        empty = (cpf_type == 'property' and not property_name
                 or cpf_type == 'constant' and not constant_value)
        if empty:
            agg_on_cpf = None
        else:
            agg_on_cpf = cpf(agg['agg_on'])
        aggregates.append(agg_op(agg['agg_type'], agg_on_cpf, agg['alias']))

    groups = []
    for group in operands['groups']:
        groups.append(cpf(group['field'], group['alias']))

    group_by = GroupBy(groups)
    return Aggregate(query_plan, group_by, aggregates)
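Compared with Example #3, this variant tolerates aggregates that carry an empty CPF stub: a count takes no operand, but a client may still submit an agg_on dict whose cpf_type is set while its payload is blank. A hypothetical operand that exercises that branch (values invented for illustration):

count_agg = {'agg_type': 'count',
             'agg_on': {'cpf_type': 'property', 'property_name': ''},
             'alias': 'count'}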
Example #7
def cohort_queryplan(plan):
  """
  Input:
  {
   'source': 'kronos', # Name of data source from settings
   'cohort':
    {'stream': CohortTest.EMAIL_STREAM, # Kronos stream to define cohort from.
     'transform': lambda x: x,          # Transformations on the kstream.
     'start': date.today(),             # The day of the first cohort.
     'unit': DateUnit.XX,               # Users are in the same cohort
                                        # if they are in the same day/week.
     'cohorts': 5,                      # How many cohorts (days/weeks/months)
                                        # to track.
     'grouping_key': 'user'},           # What key in an event should we tie
                                        # to a key in the action stream?

   'action':
     {'stream': CohortTest.FRONTPAGE_STREAM, # Stream users take actions on.
      'transform': lambda x: x,              # Transformations on the stream.
      'unit': DateUnit.XX,                   # Track events in day/week/months.
      'repetitions': 14,                  # How many days/weeks/months to track.
      'grouping_key': 'user_id'}          # What key in an event should we tie
                                          # to a key in the cohort stream?
  }

  Output:
  A metis-compatible query plan to return a cohort analysis.
  """
  cohort = plan['cohort']
  action = plan['action']
  source = plan['source']

  # Calculate the start and end dates, in Kronos time, of the
  # beginning and end of the cohort and action streams that will be
  # relevant.
  cohort_start = datetime_to_kronos_time(_date_to_datetime(cohort['start']))
  cohort_span = timedelta(**{cohort['unit']: cohort['cohorts']})
  cohort_end = cohort['start'] + cohort_span
  action_span = timedelta(**{action['unit']: action['repetitions']})
  action_end = cohort_end + action_span
  cohort_end = datetime_to_kronos_time(_date_to_datetime(cohort_end)) + 1
  action_end = datetime_to_kronos_time(_date_to_datetime(action_end)) + 1

  left = _cohort_stream_transform(source,
                                  cohort['stream'], cohort_start, cohort_end,
                                  cohort.get('transform'),
                                  cohort['grouping_key'], cohort['unit'])
  right = _cohort_stream_transform(source,
                                   action['stream'], cohort_start, action_end,
                                   action.get('transform'),
                                   action['grouping_key'], action['unit'])

  additional_action_time = (DateUnit.unit_to_kronos_time(action['unit']) *
                            action['repetitions'])

  left.alias = 'cohort'
  right.alias = 'action'

  joined = Join(left,
                right,
                (Condition(Condition.Op.EQ,
                           Property('cohort.%s' % cohort['grouping_key']),
                           Property('action.%s' % action['grouping_key'])) &
                 Condition(Condition.Op.GTE,
                           Property('action.%s' % TIMESTAMP_FIELD),
                           Property('cohort.%s' % TIMESTAMP_FIELD)) &
                 Condition(Condition.Op.LT,
                           Property('action.%s' % TIMESTAMP_FIELD),
                           Add([Property('cohort.%s' % TIMESTAMP_FIELD),
                                Constant(additional_action_time)]))))

  user_aggregated = Aggregate(
    joined,
    GroupBy([Property('cohort.date', alias=TIMESTAMP_FIELD),
             Property('cohort.%s' % cohort['grouping_key'], alias='group'),
             Floor([Subtract([Property('action.%s' % TIMESTAMP_FIELD),
                              Property('cohort.%s' % TIMESTAMP_FIELD)]),
                    Constant(DateUnit.unit_to_kronos_time(action['unit']))],
                   alias='action_step')]),
    [Count([], alias='count')]
  )

  aggregated = Aggregate(
    user_aggregated,
    GroupBy([Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
             Property('action_step', alias='action_step')]),
    [Count([], alias='cohort_actions')])

  # TODO(marcua): Also sum up the cohort sizes, join with the plan.
  return aggregated.to_dict()
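Following the docstring, a call might look like the sketch below. The stream names, key names, and counts are placeholders, and DateUnit.WEEKS is an assumption standing in for the docstring's DateUnit.XX; only the dict shape comes from the docstring above.

# Hypothetical invocation; assumes 'from datetime import date' is in scope
# and that DateUnit exposes a WEEKS member.
plan = cohort_queryplan({
  'source': 'kronos',
  'cohort': {'stream': 'signups',
             'transform': None,  # No extra transform on the stream.
             'start': date.today(),
             'unit': DateUnit.WEEKS,
             'cohorts': 5,
             'grouping_key': 'user'},
  'action': {'stream': 'frontpage_visits',
             'transform': None,
             'unit': DateUnit.WEEKS,
             'repetitions': 14,
             'grouping_key': 'user_id'},
})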
Example #8
def parse(self, _dict):
  _dict['aggregates'] = map(Aggregator.parse, _dict['aggregates'])
  _dict['group_by'] = GroupBy.parse(_dict['group_by'])
  return Aggregate(**_dict)