Beispiel #1
0
def _date_trunc(value, timeframe):
  """
  A date flooring function.

  Rounds `value` down to the start of the `timeframe` that contains it.
  For example, _date_trunc('2014-08-13 05:00:00', DateTrunc.Unit.MONTH)
  returns the ISO string for 2014-08-01 00:00:00, and the same call with a
  Kronos time returns the Kronos time for 2014-08-01 00:00:00.

  Args:
    value: a date/time string (handed to `parse`), or anything else, which is
      treated as a Kronos time and handed to `kronos_time_to_datetime`.
    timeframe: one of the DateTrunc.Unit members keyed in the table below.

  Returns:
    An `isoformat()` string when `value` was a string, otherwise the result
    of `datetime_to_kronos_time`. For string input, DAY and WEEK produce a
    date-only string because those truncators return `date` objects.

  Raises:
    KeyError: if `timeframe` is not one of the supported units.
  """
  if isinstance(value, types.StringTypes):
    value = parse(value)
    return_as_str = True
  else:
    value = kronos_time_to_datetime(value)
    return_as_str = False
  # NOTE(review): MONTH and YEAR build fresh naive datetimes, so any tzinfo on
  # the input is dropped, while SECOND/MINUTE/HOUR keep it -- confirm this is
  # intended.
  timeframes = {
    DateTrunc.Unit.SECOND: (lambda dt:
                            dt - timedelta(microseconds=dt.microsecond)),
    DateTrunc.Unit.MINUTE: (lambda dt:
                            dt - timedelta(seconds=dt.second,
                                           microseconds=dt.microsecond)),
    DateTrunc.Unit.HOUR: (lambda dt:
                          dt - timedelta(minutes=dt.minute,
                                         seconds=dt.second,
                                         microseconds=dt.microsecond)),
    DateTrunc.Unit.DAY: lambda dt: dt.date(),
    DateTrunc.Unit.WEEK: lambda dt: dt.date() - timedelta(days=dt.weekday()),
    DateTrunc.Unit.MONTH: lambda dt: datetime(dt.year, dt.month, 1),
    DateTrunc.Unit.YEAR: lambda dt: datetime(dt.year, 1, 1)
  }
  value = timeframes[timeframe](value)
  if return_as_str:
    return value.isoformat()
  if not isinstance(value, datetime):
    # DAY and WEEK yield a plain `date`; promote it to midnight of that day so
    # the Kronos conversion receives a `datetime` for every unit.
    value = datetime(value.year, value.month, value.day)
  return datetime_to_kronos_time(value)
Beispiel #2
0
def _date_trunc(value, timeframe):
  """
  A date flooring function.

  Rounds `value` down to the start of the `timeframe` that contains it.
  For example, _date_trunc('2014-08-13 05:00:00', DateTrunc.Unit.MONTH)
  returns the ISO string for 2014-08-01 00:00:00, and the same call with a
  Kronos time returns the Kronos time for 2014-08-01 00:00:00.

  Args:
    value: a date/time string (handed to `parse`), or anything else, which is
      treated as a Kronos time and handed to `kronos_time_to_datetime`.
    timeframe: one of the DateTrunc.Unit members keyed in the table below.

  Returns:
    An `isoformat()` string when `value` was a string, otherwise the result
    of `datetime_to_kronos_time`. For string input, DAY and WEEK produce a
    date-only string because those truncators return `date` objects.

  Raises:
    KeyError: if `timeframe` is not one of the supported units.
  """
  if isinstance(value, types.StringTypes):
    value = parse(value)
    return_as_str = True
  else:
    value = kronos_time_to_datetime(value)
    return_as_str = False
  # NOTE(review): MONTH and YEAR build fresh naive datetimes, so any tzinfo on
  # the input is dropped, while SECOND/MINUTE/HOUR keep it -- confirm this is
  # intended.
  timeframes = {
    DateTrunc.Unit.SECOND: (lambda dt:
                            dt - timedelta(microseconds=dt.microsecond)),
    DateTrunc.Unit.MINUTE: (lambda dt:
                            dt - timedelta(seconds=dt.second,
                                           microseconds=dt.microsecond)),
    DateTrunc.Unit.HOUR: (lambda dt:
                          dt - timedelta(minutes=dt.minute,
                                         seconds=dt.second,
                                         microseconds=dt.microsecond)),
    DateTrunc.Unit.DAY: lambda dt: dt.date(),
    DateTrunc.Unit.WEEK: lambda dt: dt.date() - timedelta(days=dt.weekday()),
    DateTrunc.Unit.MONTH: lambda dt: datetime(dt.year, dt.month, 1),
    DateTrunc.Unit.YEAR: lambda dt: datetime(dt.year, 1, 1)
  }
  value = timeframes[timeframe](value)
  if return_as_str:
    return value.isoformat()
  if not isinstance(value, datetime):
    # DAY and WEEK yield a plain `date`; promote it to midnight of that day so
    # the Kronos conversion receives a `datetime` for every unit.
    value = datetime(value.year, value.month, value.day)
  return datetime_to_kronos_time(value)
Beispiel #3
0
def cohort_queryplan(plan):
  """
  Build a metis-compatible query plan for a cohort analysis.

  Input:
  {
   'source': 'kronos',                  # Name of data source from settings.
   'cohort': {
     'stream': CohortTest.EMAIL_STREAM, # Kronos stream to define cohort from.
     'transform': lambda x: x,          # Optional transformation on the
                                        # stream.
     'start': date.today(),             # The day of the first cohort.
     'unit': DateUnit.XX,               # Users are in the same cohort if
                                        # they are in the same day/week.
     'cohorts': 5,                      # How many cohorts (in `unit`s) to
                                        # track.
     'grouping_key': 'user',            # Key in a cohort event to match
                                        # against the action stream's key.
   },
   'action': {
     'stream': CohortTest.FRONTPAGE_STREAM, # Stream users take actions on.
     'transform': lambda x: x,          # Optional transformation on the
                                        # stream.
     'unit': DateUnit.XX,               # Bucket size for tracking actions.
     'repetitions': 14,                 # How many `unit`s to track.
     'grouping_key': 'user_id',         # Key in an action event to match
                                        # against the cohort stream's key.
   },
  }

  Each `unit` is used as a keyword-argument name for `timedelta`, so it must
  be a name `timedelta` accepts.

  Output:
  The dict form (`to_dict()`) of a plan that counts, for each cohort date and
  action step, the distinct (cohort date, grouping key, action step) groups.
  """
  cohort_spec = plan['cohort']
  action_spec = plan['action']
  data_source = plan['source']

  # Work out, in Kronos time, the window of each stream that matters: cohorts
  # run from the start day for `cohorts` units, and actions can trail the
  # last cohort by a further `repetitions` units.
  first_day = cohort_spec['start']
  cohort_start = datetime_to_kronos_time(_date_to_datetime(first_day))
  last_cohort_day = first_day + timedelta(
    **{cohort_spec['unit']: cohort_spec['cohorts']})
  last_action_day = last_cohort_day + timedelta(
    **{action_spec['unit']: action_spec['repetitions']})
  cohort_end = datetime_to_kronos_time(_date_to_datetime(last_cohort_day)) + 1
  action_end = datetime_to_kronos_time(_date_to_datetime(last_action_day)) + 1

  cohort_stream = _cohort_stream_transform(
    data_source, cohort_spec['stream'], cohort_start, cohort_end,
    cohort_spec.get('transform'), cohort_spec['grouping_key'],
    cohort_spec['unit'])
  action_stream = _cohort_stream_transform(
    data_source, action_spec['stream'], cohort_start, action_end,
    action_spec.get('transform'), action_spec['grouping_key'],
    action_spec['unit'])

  # Length of one action step, and of the whole tracked window, in Kronos
  # time.
  step_length = DateUnit.unit_to_kronos_time(action_spec['unit'])
  additional_action_time = step_length * action_spec['repetitions']

  cohort_stream.alias = 'cohort'
  action_stream.alias = 'action'

  cohort_key = 'cohort.%s' % cohort_spec['grouping_key']
  action_key = 'action.%s' % action_spec['grouping_key']
  cohort_time = 'cohort.%s' % TIMESTAMP_FIELD
  action_time = 'action.%s' % TIMESTAMP_FIELD

  # An action pairs with a cohort event when the keys match and the action
  # happens at or after the cohort event but before the tracked window ends.
  same_entity = Condition(Condition.Op.EQ,
                          Property(cohort_key),
                          Property(action_key))
  not_before_cohort = Condition(Condition.Op.GTE,
                                Property(action_time),
                                Property(cohort_time))
  inside_window = Condition(Condition.Op.LT,
                            Property(action_time),
                            Add([Property(cohort_time),
                                 Constant(additional_action_time)]))
  joined = Join(cohort_stream,
                action_stream,
                (same_entity & not_before_cohort & inside_window))

  # First pass: collapse to one row per (cohort date, entity, action step).
  elapsed = Subtract([Property(action_time), Property(cohort_time)])
  per_entity_groups = GroupBy([
    Property('cohort.date', alias=TIMESTAMP_FIELD),
    Property(cohort_key, alias='group'),
    Floor([elapsed, Constant(step_length)], alias='action_step'),
  ])
  user_aggregated = Aggregate(joined,
                              per_entity_groups,
                              [Count([], alias='count')])

  # Second pass: count those rows per (cohort date, action step).
  per_cohort_groups = GroupBy([
    Property(TIMESTAMP_FIELD, alias=TIMESTAMP_FIELD),
    Property('action_step', alias='action_step'),
  ])
  aggregated = Aggregate(user_aggregated,
                         per_cohort_groups,
                         [Count([], alias='cohort_actions')])

  # TODO(marcua): Also sum up the cohort sizes, join with the plan.
  return aggregated.to_dict()
  def generate_data(self):
    """
    Write random email and front-page events to Kronos and return the cohort
    results those events should produce.

    Returns a dict of the form:
      {cohort_date_str: {'cohort_size': NN,
                         'action_dates': {day_offset: num_users}}}
    where `day_offset` is the whole number of days between a user's email and
    the action being counted.
    """
    user_ids = range(CohortTestCase.NUM_USERS)
    num_groups = len(CohortTestCase.EMAIL_WEEKS)
    window_days = CohortTestCase.ACTION_REPETITION_DAYS

    # While events are generated, 'action_dates' maps a day offset to the set
    # of users seen at that offset; the sets become counts at the very end.
    expected = defaultdict(lambda: {'cohort_size': 0,
                                    'action_dates': defaultdict(set)})

    # Email stream: every user gets two emails, in the pair of weeks that
    # `EMAIL_WEEKS` assigns to the user's group. Each email lands at a random
    # hour offset (0-72h) from the start of its week.
    emails_by_user = {}
    for uid in user_ids:
      first_weeks, second_weeks = CohortTestCase.EMAIL_WEEKS[uid % num_groups]
      emails = []
      for weeks in (first_weeks, second_weeks):
        cohort_dt = CohortTestCase.START_DATETIME + timedelta(weeks=weeks)
        sent_dt = cohort_dt + timedelta(hours=randint(0, 72))
        emails.append({'cohort': datetime_to_date_str(cohort_dt),
                       'cohort_date': cohort_dt,
                       'precise_date': sent_dt})
      emails_by_user[uid] = tuple(emails)
      for email in emails:
        expected[email['cohort']]['cohort_size'] += 1
      self.kronos_client.put({
        CohortTestCase.EMAIL_STREAM: [
          {'user': uid,
           constants.TIMESTAMP_FIELD:
             datetime_to_kronos_time(email['precise_date'])}
          for email in emails
        ]
      })

    # Action stream: a user in group g (1-based) of G groups acts with
    # probability g/G, scaled by 1/N on the N-th day after an email.
    for uid in user_ids:
      group_probability = ((uid % num_groups) + 1.0) / num_groups
      earliest_by_day = defaultdict(dict)
      for email in emails_by_user[uid]:
        for offset in xrange(window_days):
          day_probability = 1.0 / (offset + 1)
          if random() >= group_probability * day_probability:
            continue
          action_dt = email['precise_date'] + timedelta(days=offset)
          action_day = action_dt.date()
          # The cohort plan compares against the earliest action seen so far
          # on this calendar day.
          earliest = min(action_dt,
                         earliest_by_day.get(action_day,
                                             CohortTestCase.MAX_DT))
          earliest_by_day[action_day] = earliest
          self.kronos_client.put({
            CohortTestCase.FRONTPAGE_STREAM: [
              {'user_id': uid,
               '@time': datetime_to_kronos_time(action_dt)}]
          })
          # Credit the action to every email of this user whose tracking
          # window contains it.
          for candidate in emails_by_user[uid]:
            sent_at = candidate['precise_date']
            if sent_at <= earliest < sent_at + timedelta(window_days):
              elapsed_days = (earliest - sent_at).days
              # A set, so each user is counted only once per offset.
              expected[candidate['cohort']]['action_dates'][
                elapsed_days].add(uid)

    # Replace each set of users with its size.
    for cohort_stats in expected.values():
      per_offset = cohort_stats['action_dates']
      for elapsed_days in list(per_offset):
        per_offset[elapsed_days] = len(per_offset[elapsed_days])

    return expected
Beispiel #5
0
    def generate_data(self):
        """
        Fill the email and front-page streams with random events.

        Returns the cohort results expected for those events, shaped as
        {cohort_date_str: {'cohort_size': NN,
                           'action_dates': {day_offset: num_users}}},
        where `day_offset` is the number of whole days between an email and
        the action counted against it.
        """
        case = CohortTestCase
        user_ids = range(case.NUM_USERS)

        # 'action_dates' holds sets of user ids while events are generated;
        # the sets are turned into counts just before returning.
        expected = defaultdict(lambda: {
            'cohort_size': 0,
            'action_dates': defaultdict(set)
        })
        user_dates = {}

        # Email stream: each user receives one email in each of the two
        # weeks `EMAIL_WEEKS` lists for the user's group, at a random hour
        # (0-72h) after the start of that week.
        for user_id in user_ids:
            group_weeks = case.EMAIL_WEEKS[user_id % len(case.EMAIL_WEEKS)]
            first_offset, second_offset = group_weeks
            first_week = case.START_DATETIME + timedelta(weeks=first_offset)
            first_sent = first_week + timedelta(hours=randint(0, 72))
            second_week = case.START_DATETIME + timedelta(weeks=second_offset)
            second_sent = second_week + timedelta(hours=randint(0, 72))
            first_label = datetime_to_date_str(first_week)
            second_label = datetime_to_date_str(second_week)

            first_email = {'cohort': first_label,
                           'cohort_date': first_week,
                           'precise_date': first_sent}
            second_email = {'cohort': second_label,
                            'cohort_date': second_week,
                            'precise_date': second_sent}
            user_dates[user_id] = (first_email, second_email)

            expected[first_label]['cohort_size'] += 1
            expected[second_label]['cohort_size'] += 1

            first_event = {
                'user': user_id,
                constants.TIMESTAMP_FIELD: datetime_to_kronos_time(first_sent),
            }
            second_event = {
                'user': user_id,
                constants.TIMESTAMP_FIELD: datetime_to_kronos_time(second_sent),
            }
            self.kronos_client.put(
                {case.EMAIL_STREAM: [first_event, second_event]})

        # Action stream: a user in group g (1-based) of G groups hits the
        # front page with probability g/G, scaled by 1/N on the N-th day
        # after an email.
        for user_id in user_ids:
            sent_emails = user_dates[user_id]
            day_to_min_action_dt = defaultdict(dict)
            for email in sent_emails:
                for day_index in xrange(case.ACTION_REPETITION_DAYS):
                    group_size = len(case.EMAIL_WEEKS)
                    group_probability = (
                        ((user_id % group_size) + 1.0) / group_size)
                    day_probability = 1.0 / (day_index + 1)
                    threshold = group_probability * day_probability
                    if random() < threshold:
                        action_dt = (email['precise_date'] +
                                     timedelta(days=day_index))
                        calendar_day = action_dt.date()
                        # The cohort plan compares against the earliest
                        # action recorded so far on this calendar day.
                        previous_min = day_to_min_action_dt.get(
                            calendar_day, case.MAX_DT)
                        action_compare_dt = min(action_dt, previous_min)
                        day_to_min_action_dt[calendar_day] = action_compare_dt
                        action_event = {
                            'user_id': user_id,
                            '@time': datetime_to_kronos_time(action_dt),
                        }
                        self.kronos_client.put(
                            {case.FRONTPAGE_STREAM: [action_event]})
                        # Credit the action to each of the user's emails
                        # whose tracking window covers it.
                        for candidate in sent_emails:
                            window_open = candidate['precise_date']
                            too_early = window_open > action_compare_dt
                            if too_early or (
                                    window_open +
                                    timedelta(case.ACTION_REPETITION_DAYS)
                                    <= action_compare_dt):
                                continue
                            days_since_email = (
                                action_compare_dt - window_open).days
                            # A set, so each user counts once per offset.
                            bucket = expected[candidate['cohort']][
                                'action_dates'][days_since_email]
                            bucket.add(user_id)

        # Swap each set of users for its size.
        for cohort_name in expected:
            counts = expected[cohort_name]['action_dates']
            for offset in counts.keys():
                counts[offset] = len(counts[offset])

        return expected
Beispiel #6
0
def cohort_queryplan(plan):
  """
  Build a metis-compatible query plan for a cohort analysis.

  Input:
  {
   'kronos_url': 'http://...',          # Optional; falls back to
                                        # app.config['KRONOS_SERVER'].
   'cohort': {
     'stream': CohortTest.EMAIL_STREAM, # Kronos stream to define cohort from.
     'transform': lambda x: x,          # Optional transformation on the
                                        # stream.
     'start': date.today(),             # The day of the first cohort.
     'unit': DateUnit.XX,               # Users are in the same cohort if
                                        # they are in the same day/week.
     'cohorts': 5,                      # How many cohorts (in `unit`s) to
                                        # track.
     'grouping_key': 'user',            # Key in a cohort event to match
                                        # against the action stream's key.
   },
   'action': {
     'stream': CohortTest.FRONTPAGE_STREAM, # Stream users take actions on.
     'transform': lambda x: x,          # Optional transformation on the
                                        # stream.
     'unit': DateUnit.XX,               # Bucket size for tracking actions.
     'repetitions': 14,                 # How many `unit`s to track.
     'grouping_key': 'user_id',         # Key in an action event to match
                                        # against the cohort stream's key.
   },
  }

  Each `unit` is used as a keyword-argument name for `timedelta`, so it must
  be a name `timedelta` accepts.

  Output:
  A plan that counts, for each cohort date and action step, the distinct
  (cohort date, grouping key, action step) groups.
  """
  cohort_spec = plan['cohort']
  action_spec = plan['action']
  kronos_url = plan.get('kronos_url', app.config['KRONOS_SERVER'])

  # Work out, in Kronos time, the window of each stream that matters: cohorts
  # run from the start day for `cohorts` units, and actions can trail the
  # last cohort by a further `repetitions` units.
  first_day = cohort_spec['start']
  cohort_start = datetime_to_kronos_time(_date_to_datetime(first_day))
  last_cohort_day = first_day + timedelta(
    **{cohort_spec['unit']: cohort_spec['cohorts']})
  last_action_day = last_cohort_day + timedelta(
    **{action_spec['unit']: action_spec['repetitions']})
  cohort_end = datetime_to_kronos_time(_date_to_datetime(last_cohort_day)) + 1
  action_end = datetime_to_kronos_time(_date_to_datetime(last_action_day)) + 1

  cohort_stream = _cohort_stream_transform(
    kronos_url, cohort_spec['stream'], cohort_start, cohort_end,
    cohort_spec.get('transform'), cohort_spec['grouping_key'],
    cohort_spec['unit'])
  action_stream = _cohort_stream_transform(
    kronos_url, action_spec['stream'], cohort_start, action_end,
    action_spec.get('transform'), action_spec['grouping_key'],
    action_spec['unit'])

  # Length of one action step, and of the whole tracked window, in Kronos
  # time.
  step_length = DateUnit.unit_to_kronos_time(action_spec['unit'])
  additional_action_time = step_length * action_spec['repetitions']

  cohort_key = 'cohort.%s' % cohort_spec['grouping_key']
  action_key = 'action.%s' % action_spec['grouping_key']
  cohort_time = 'cohort.%s' % TIMESTAMP_FIELD
  action_time = 'action.%s' % TIMESTAMP_FIELD

  # An action pairs with a cohort event when the keys match and the action
  # happens at or after the cohort event but before the tracked window ends.
  same_entity = cond(p(cohort_key), p(action_key), ConditionOpType.EQ)
  not_before_cohort = cond(p(action_time), p(cohort_time),
                           ConditionOpType.GTE)
  window_close = f(FunctionType.ADD,
                   [p(cohort_time), c(additional_action_time)])
  inside_window = cond(p(action_time), window_close, ConditionOpType.LT)
  joined = join(cohort_stream,
                action_stream,
                cond_and(same_entity, not_before_cohort, inside_window),
                left_alias='cohort',
                right_alias='action')

  # First pass: collapse to one row per (cohort date, entity, action step);
  # no aggregates are computed here.
  elapsed = f(FunctionType.SUBTRACT, [p(action_time), p(cohort_time)])
  per_entity_groups = {
    TIMESTAMP_FIELD: p('cohort.date'),
    'group': p(cohort_key),
    'action_step': f(FunctionType.FLOOR, [elapsed, c(step_length)]),
  }
  user_aggregated = agg(joined, per_entity_groups, [])

  # Second pass: count those rows per (cohort date, action step).
  per_cohort_groups = {
    TIMESTAMP_FIELD: p(TIMESTAMP_FIELD),
    'action_step': p('action_step'),
  }
  cohort_action_count = agg_op(AggregateType.COUNT, [],
                               alias='cohort_actions')
  aggregated = agg(user_aggregated, per_cohort_groups, [cohort_action_count])

  # TODO(marcua): Also sum up the cohort sizes, join with the plan.
  return aggregated