Example #1
def _generate_daily_hour_buckets(from_date, to_date):
    """ Generate a list of hour level timeslots, for each day from the interval. """
    timeslot_ranges = []
    day_timeslots = gen_timeslots(from_date, to_date, level='day')
    for ts in day_timeslots:
        from_date_d = timeslot_to_datetime(ts)
        to_date_d = from_date_d.replace(hour=23, minute=59)
        hourly_levels = list(
            gen_timeslots(from_date_d, to_date_d, level='hour'))
        timeslot_ranges.append(hourly_levels)
    return timeslot_ranges
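
A minimal usage sketch of the helper above; the concrete dates are assumptions, and the expectation of roughly 24 hour slots per full day follows from the 00:00–23:59 window rather than from documented behavior of gen_timeslots:

from datetime import datetime

# Hypothetical call; the dates below are illustrative only.
buckets = _generate_daily_hour_buckets(datetime(2014, 1, 1), datetime(2014, 1, 3))
for day_hours in buckets:
    # Each entry should hold the hour-level timeslots of a single day.
    print(len(day_hours))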
Example #2
def aggregate_stats(user,
                    channel,
                    from_,
                    to_,
                    level,
                    stats=('volume', 'latency')):
    data = {}
    for a in stats:
        data[a] = []

    by_ts = {}

    for stat in ServiceChannelStats.objects.by_time_span(user,
                                                         channel,
                                                         start_time=from_,
                                                         end_time=to_,
                                                         level=level):
        by_ts[stat.time_slot] = stat

    counts = defaultdict(int)
    for slot in gen_timeslots(from_, to_, level):
        for stat in stats:
            stat_obj = by_ts.get(slot, None)
            if stat_obj:
                value = getattr(
                    stat_obj, 'average_latency' if stat == 'latency' else stat)
            else:
                value = 0

            data[stat].append([timeslot_to_timestamp_ms(slot), value])
            counts[stat] += value

    return data, counts
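
The core pattern in aggregate_stats — index the sparse stats by timeslot, then walk every slot in the range and fall back to 0 — can be sketched on its own. The helper name and arguments below are illustrative, not part of the codebase:

def densify_series(by_ts, slots, attr):
    # Emit one point per timeslot so the plotted series has no gaps.
    series, total = [], 0
    for slot in slots:
        stat_obj = by_ts.get(slot)
        value = getattr(stat_obj, attr) if stat_obj else 0
        series.append([slot, value])
        total += value
    return series, total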
Example #3
    def get_time_data(groups, y_axis):
        total_counts = defaultdict(int)
        total_items = defaultdict(int)
        data = defaultdict(list)

        for slot in gen_timeslots(from_ts, to_ts):
            timestamp = timeslot_to_timestamp_ms(slot)
            features_data = groups.get(slot, {})

            for feature in y_axis:
                feature_key = get_feature_key(feature)

                if features_data.get(feature_key):
                    count = _get_count(features_data[feature_key])

                    total_counts[feature_key] += count
                    total_items[feature_key] += 1
                    data[feature_key].append([timestamp, count])
                else:
                    data[feature_key].append([timestamp, 0])

        if plot_type == 'response-time':
            # return average as result
            result_counts = defaultdict(float)
            for key, value in total_counts.iteritems():
                if total_items.get(key):
                    result_counts[key] = round(value / total_items[key], 2)
                else:
                    result_counts[key] = 0
        else:
            result_counts = total_counts
        return data, result_counts
Example #4
    def _get_data(int_id):
        data = []
        for slot in gen_timeslots(from_dt, to_dt, level):
            timestamp = timeslot_to_timestamp_ms(slot)
            count = ts_counts.get(slot, 0)
            data.append((timestamp, count))
        return data
Example #5
def _get_performance_stats(user, channel, from_, to_, level, stats_type):
    """ Return list of items for Performance stats graph

    """

    if not isinstance(stats_type, list):
        raise RuntimeError('stats_type should be an array')

    result = []
    for stype in stats_type:
        if stype not in [
                'number_of_posts', 'number_of_actionable_posts',
                'number_of_impressions', 'number_of_clicks',
                'number_of_rejected_posts'
        ]:
            raise RuntimeError("unsupported stats_type %s" % stype)

        values = _get_channel_stats_values(user, channel, from_, to_, level,
                                           stype)
        data = []
        count = 0
        for slot in gen_timeslots(from_, to_, level):
            value = values.get(slot, 0)
            data.append([timeslot_to_timestamp_ms(slot), value])
            count += value
        result.append(dict(data=data, label=stype.split("_")[2], count=count))

    return jsonify(ok=True, list=result, level=level)
Example #6
def purge_days(channel):
    '''
    Purge the days we maintain in our history, starting from the last purge (or a
    14-day lookback), that have not yet been purged.
    '''

    # purge every day from last_purged (or a 14-day lookback) up to today
    if channel.last_purged:
        range_start = utc(channel.last_purged)
    else:
        range_start = now() - relativedelta(days=14)

    days_to_purge = list(gen_timeslots(range_start, now(), level='day'))

    trend_stats = [0, 0, 0]
    topic_stats = [0, 0, 0]

    for day in days_to_purge:
        topic_res = mark_and_sweep_topics(channel, day)
        topic_stats = [x + y for x, y in zip(topic_stats, topic_res)]
        #LOGGER.debug("TOPIC STATS: %s", topic_res)
        trend_res = purge_corresponding_trends(channel=channel, timeslot=day)
        trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]

    return days_to_purge, topic_stats, trend_stats
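
The window logic reduces to: resume from channel.last_purged when it is set, otherwise fall back to a fixed lookback, then enumerate every day up to now. A standalone sketch of that window with plain datetimes instead of gen_timeslots (the helper below is illustrative, not from the codebase):

from datetime import datetime, timedelta

def purge_window_days(last_purged, lookback_days=14, today=None):
    # Resume from last_purged if we have one, otherwise fall back to a fixed lookback,
    # then enumerate every calendar day up to today (mirrors purge_days above).
    today = today or datetime.utcnow()
    start = last_purged or (today - timedelta(days=lookback_days))
    day = start.date()
    days = []
    while day <= today.date():
        days.append(day)
        day += timedelta(days=1)
    return days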
Example #7
def purge_months(channel):
    '''
    Purge the months we maintain in our history, starting from the last purge (or a
    two-month lookback), that have not yet been purged.
    '''
    if channel.last_purged:
        range_start = utc(channel.last_purged)
    else:
        range_start = now() - relativedelta(months=2)

    mday = localtime().tm_mday

    if mday > 7:
        range_end = now()
    else:
        range_end = now() - relativedelta(months=1)

    months_to_purge = []

    trend_stats = [0, 0, 0]
    topic_stats = [0, 0, 0]
    if range_start <= range_end:
        months_to_purge = list(
            gen_timeslots(range_start, range_end, level='month'))

        for month in months_to_purge:
            topic_res = mark_and_sweep_topics(channel, month)
            topic_stats = [x + y for x, y in zip(topic_stats, topic_res)]
            #LOGGER.debug("TOPIC STATS: %s", topic_res)
            trend_res = purge_corresponding_trends(channel=channel,
                                                   timeslot=month)
            trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]

    return months_to_purge, topic_stats, trend_stats
Example #8
def compute_customer_timeline(customer, from_dt, to_dt):
    def _get_platform(event):
        platform = event._t[0]
        if platform.endswith('Post') and platform != 'Post':
            platform = platform[:-len('Post')]
        return platform

    timeline_data = []
    for monthly_slot in reversed(
            list(timeslot.gen_timeslots(from_dt, to_dt, 'month'))):
        _month_start, _month_end = timeslot.Timeslot(monthly_slot).interval
        _month_events_count = Event.objects.range_query_count(
            from_dt, to_dt, customer)

        if not _month_events_count:
            continue

        if _month_start.month == to_dt.month:
            month_label = 'This Month'
        elif _month_start.month == to_dt.month - 1:
            month_label = 'Last Month'
        else:
            month_label = _month_start.strftime('%B')

        timeline_data.append([month_label, []])

        for daily_slot in reversed(
                list(timeslot.gen_timeslots(from_dt, to_dt, 'day'))):
            _day_start, _day_end = timeslot.Timeslot(daily_slot).interval
            _day_events = list(
                Event.objects.range_query(max(utc(from_dt), _day_start),
                                          min(utc(to_dt), _day_end), customer))

            if not _day_events:
                continue

            day_label = _day_start.strftime('%b %d')
            timeline_data[-1][-1].append([day_label, []])

            grouper = itertools.groupby(_day_events, _get_platform)
            for platform, platform_events in grouper:
                _events = list(platform_events)
                event_interval_ids = (str(_events[0].id), str(_events[-1].id))
                timeline_data[-1][-1][-1][-1].append(
                    (platform, len(_events), event_interval_ids))
    return customer, timeline_data
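
The nested appends build a structure of the form [month_label, [[day_label, [(platform, event_count, (first_id, last_id)), ...]], ...]]. An illustrative value of that shape (all labels, platforms, counts, and ids below are made up):

timeline_data_example = [
    ['This Month', [
        ['Jan 15', [('Twitter', 3, ('id_a', 'id_b'))]],
    ]],
    ['Last Month', [
        ['Dec 30', [('Facebook', 1, ('id_c', 'id_c'))]],
    ]],
]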
Example #9
def _generate_day_level_ranges(from_date, to_date):
    """ Generate a bunch of [from-date, to-date] ranges for each month in the interval. """
    timeslot_ranges = []
    month_timeslots = gen_timeslots(from_date, to_date, level='month')
    for ts in month_timeslots:
        from_date_m = timeslot_to_datetime(ts)
        timeslot_ranges.append(_get_month_day_range(from_date_m))
    return timeslot_ranges
Example #10
    def by_time_span(self, channel=None, parent_topic=None, intentions=None, statuses=None,
                     agents=None, languages=None, from_ts=None, to_ts=None, limit=100):
        # Use the aggregation framework to resolve the counts:
        # match on channel + slot + hashed_parents [+ status [+ intention_type ]]
        # group on topic, sum(leaf or node count?)
        # sort(count, -1)
        # limit(100)
        F = ChannelHotTopics.F

        from_ts = Timeslot(from_ts).timeslot
        to_ts   = Timeslot(to_ts or from_ts).timeslot

        time_range = list(gen_timeslots(from_ts, to_ts, closed_range=False))
        assert len(time_range) <= 7, "Max allowed range is 7 days, got %s %s" % (len(time_range), time_range)

        if len(time_range) == 1:
            time_query = {F("time_slot"): time_range[0]}
        else:
            time_query = {F("time_slot"): {"$in": time_range}}

        channel_num = get_channel_num(channel)
        if parent_topic is None:
            parents = []
        else:
            parents = get_topic_hash(parent_topic)

        intention_ids = set(intentions or [ALL_INTENTIONS_ID])
        intention_ids = map(get_intention_id, intention_ids)

        statuses = set(statuses or SpeechActMap.STATUS_NAME_MAP)
        statuses = map(get_status_code, statuses)
        languages = map(get_lang_id, languages or [])

        match_query_base = {
            F("channel_num")    : channel_num,
            F("status")         : {"$in" : statuses},
            F("hashed_parents") : parents,
        }
        match_query_base.update(time_query)

        agent_ids = [a.agent_id for a in (agents or [])] or [ALL_AGENTS]

        match_query_filters = {
            "es.at": {"$in": agent_ids},
            "es.in": {"$in": intention_ids}
        }
        match_query_filters.update(make_lang_query(languages))

        return self.execute_pipeline(match_query_base, match_query_filters, limit)
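
The comment block at the top of by_time_span describes the pipeline that execute_pipeline presumably assembles. A hedged sketch of that shape as a raw aggregation pipeline; the field names and the helper are illustrative, not the actual schema or implementation:

def build_hot_topics_pipeline(match_query, limit=100):
    # match -> group by topic, summing counts -> sort descending -> cap the result size
    return [
        {"$match": match_query},
        {"$group": {"_id": "$topic", "count": {"$sum": "$topic_count"}}},
        {"$sort": {"count": -1}},
        {"$limit": limit},
    ]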
Example #11
    def _get_data(from_dt, to_dt, level, pairs, stat_type):
        count = len(pairs)

        date_counts = defaultdict(int)
        total = 0
        for p in pairs:
            #p[0] - time slot
            #p[1] - increment
            date_counts[p[0]] += p[1]
            total += p[1]

        data = []
        for slot in gen_timeslots(from_dt, to_dt, level):
            js_time_stamp = timeslot_to_timestamp_ms(slot)
            data.append((js_time_stamp, date_counts[slot]))

        if stat_type == 'clicks':
            count = total

        return count, data
Example #12
    def get_time_data(self, groups, y_axis):
        """ Return data formated in a FLOT specific format; eg. [[time, count], [time, count]]
        so that we can use it for time plots """
        real_counts = defaultdict(int)
        # We need to actually count the response volume across this data, not timeslots
        # for an accurate average over response time
        for feature in y_axis:
            feature_key = self.get_feature_key(feature)
            for _, value in groups.iteritems():
                if feature_key in value:
                    real_counts[feature_key] += value[feature_key].get('rv', 0)

        total_counts = defaultdict(int)
        total_items = defaultdict(int)
        data = defaultdict(list)

        for slot in gen_timeslots(self.from_ts, self.to_ts):
            timestamp = timeslot_to_timestamp_ms(slot)
            features_data = groups.get(slot, {})
            for feature in y_axis:
                feature_key = self.get_feature_key(feature)
                if features_data.get(feature_key):
                    count = features_data[feature_key].get('count', 0)
                    total_counts[
                        feature_key] += count * features_data[feature_key].get(
                            'rv', 1)
                    total_items[feature_key] += 1
                    data[feature_key].append([timestamp, count])
                else:
                    data[feature_key].append([timestamp, 0])

        result_counts = defaultdict(float)
        for key, value in total_counts.iteritems():
            if total_items.get(key):
                if real_counts[key]:
                    result_counts[key] = round(value / real_counts[key], 2)
                else:
                    result_counts[key] = 0
            else:
                result_counts[key] = 0
        return data, result_counts, total_items
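
The 'rv' bookkeeping above amounts to a response-volume-weighted average: per-slot counts are weighted by response volume and then divided by the total response volume. A minimal standalone sketch of that calculation (the function and argument names are illustrative):

def weighted_average(points):
    # points: iterable of (count, response_volume) pairs
    points = list(points)
    weighted_sum = sum(count * volume for count, volume in points)
    total_volume = sum(volume for _, volume in points)
    return round(float(weighted_sum) / total_volume, 2) if total_volume else 0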
Example #13
    def get_time_data(self, groups, y_axis):
        """ Return data formated in a FLOT specific format; eg. [[time, count], [time, count]]
        so that we can use it for time plots """
        total_counts = defaultdict(int)
        total_items = defaultdict(int)
        data = defaultdict(list)

        for slot in gen_timeslots(self.from_ts, self.to_ts):
            timestamp = timeslot_to_timestamp_ms(slot)
            features_data = groups.get(slot, {})
            for feature in y_axis:
                feature_key = self.get_feature_key(feature)
                if features_data.get(feature_key):
                    count = features_data[feature_key].get('count', 0)
                    total_counts[feature_key] += count
                    total_items[feature_key] += 1
                    data[feature_key].append([timestamp, count])
                else:
                    data[feature_key].append([timestamp, 0])

        return data, total_counts, total_items
Example #14
def purge_corresponding_trends(channel, timeslot):
    ts_date, ts_level = decode_timeslot(timeslot)
    sub_level = {"month": "day", "day": "hour"}[ts_level]
    range_start = ts_date

    if "month" == ts_level:
        range_end = ts_date + relativedelta(months=1)
    else:
        range_end = ts_date + relativedelta(days=1)

    timeslots_to_purge = list(
        gen_timeslots(range_start, range_end, level=sub_level))[:-1]
    topics = trends_find_topics(timeslot, channel)
    trend_stats = [0, 0, 0]

    total_number = len(timeslots_to_purge)
    for i, ts in enumerate(timeslots_to_purge):
        LOGGER.info(
            'timeslot info: channel: %s; current timeslot "%s"; %sth timeslot of %s timeslots',
            channel.title, decode_timeslot(ts), i, total_number)
        trend_res = mark_and_sweep_trends(channel, ts, topics)
        trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]
    return tuple(trend_stats)
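
The sub-level walk follows a simple rule: a month slot is purged at day granularity over one month, and a day slot at hour granularity over one day, with the final boundary slot excluded. A sketch of just that rule with plain datetimes (illustrative names, not the real timeslot codec):

from dateutil.relativedelta import relativedelta

SUB_LEVEL = {"month": "day", "day": "hour"}

def sub_purge_range(ts_date, ts_level):
    # A month spans one month of day slots; a day spans one day of hour slots.
    delta = relativedelta(months=1) if ts_level == "month" else relativedelta(days=1)
    return ts_date, ts_date + delta, SUB_LEVEL[ts_level]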
Example #15
    except Channel.DoesNotExist, e:
        return jsonify(ok=False, error=str(e))

    from_dt, to_dt = parse_date_interval(data['from'], data['to'])
    #level          = guess_timeslot_level(from_dt, to_dt)
    #print from_dt, to_dt, level

    intention_type_ids = [ SATYPE_NAME_TO_ID_MAP[intention]
                           for intention in data['intentions'] ]

    intention_types = defaultdict(int)
    for speech_act in SpeechActMap.objects.find_by_user(
        user,
        channels__in          = [channel.id],
        intention_type_id__in = intention_type_ids,
        time_slot__in         = list(gen_timeslots(from_dt, to_dt, 'hour'))
    ):
        intention_types[speech_act.intention_type_id] += 1

    res = []
    for (intention_type_id, count) in intention_types.items():
        res.append({
            'label': SATYPE_ID_TO_NAME_MAP[str(intention_type_id)],
             'data': count})

    return jsonify(ok=True, list=res)


@app.route('/performance/trends/json', methods=['POST'])
@login_required()
def performance_trends(user):
Example #16
def compute_account_stats(account,
                          idx,
                          from_date,
                          to_date,
                          levels=('hour', 'day'),
                          output_stream=None,
                          raise_on_diffs=False,
                          test_mode=True,
                          ignore_purging=False,
                          ignore_topics=False):
    from solariat_bottle.db.channel.base import Channel
    from solariat_bottle.db.post.utils import get_platform_class
    from solariat_bottle.db.speech_act import SpeechActMap
    from solariat.utils.timeslot import gen_timeslots

    start_processing = datetime.now()
    all_channels = Channel.objects.find(account=account)[:]
    all_channels = [c for c in all_channels if not c.is_service]

    if not all_channels: return
    Post = get_platform_class(all_channels[0].platform)
    computed_months = list(gen_timeslots(from_date, to_date, level='month'))

    base_channels = [c for c in all_channels if not c.is_smart_tag]
    timeslot_ranges = _generate_daily_hour_buckets(from_date, to_date)
    post_count = 0
    for day_timeslot in timeslot_ranges:
        # Since we don't want to fetch the posts separately for every channel, but we
        # also want to keep post batches small so we don't overflow memory, we do
        # partial upserts of day-level trends on an hourly-timeslot basis. To do this
        # we need to keep track of which stats were partially computed, so we increment
        # values instead of just removing + batch inserting.
        partial_updates = {}
        channel_trends_caches = {}
        for channel in all_channels:
            partial_updates[channel.id] = {'ctt': set([]), 'cht': set([])}
            channel_trends_caches[channel.id] = {}

        day_speech_act_filter = _compute_sam_match_query(
            base_channels, day_timeslot[0], day_timeslot[-1])
        post_ids = [
            sa['pt']
            for sa in SpeechActMap.objects.coll.find(day_speech_act_filter)
        ]

        if len(post_ids) > MAX_BATCH_SIZE:
            ## This is really for very high-load channels: go in batches of at most
            ## MAX_BATCH_SIZE so we don't lock the mongo connection.
            sams_batches = int(ceil(len(post_ids) / float(MAX_BATCH_SIZE)))
        else:
            sams_batches = 1
        for sams_batch_idx in xrange(sams_batches):
            # Go in MAX_BATCH_SIZE increments through the posts
            from_idx = sams_batch_idx * MAX_BATCH_SIZE
            to_idx = (sams_batch_idx + 1) * MAX_BATCH_SIZE
            posts = Post.objects.find(id__in=post_ids[from_idx:to_idx])[:]
            for channel in all_channels:
                # For now always ignore purging, it's a huge performance leak
                if not ignore_topics:
                    chts_cache, ctts_cache = _process_channel(
                        channel, posts, computed_months, True, levels,
                        channel_trends_caches[channel.id])
                    # Now do the partial updates on hourly level stats
                    _upsert_channel_topic_trends(
                        ctts_cache,
                        partial_keys=partial_updates[channel.id]['ctt'])
                    _upsert_channel_hot_topics(
                        chts_cache,
                        partial_keys=partial_updates[channel.id]['cht'])
                else:
                    _process_channel(channel, posts, computed_months, True,
                                     levels, channel_trends_caches[channel.id])
            post_count += len(posts)

        logger.info("Finished processing %s posts in %s " %
                    (post_count, datetime.now() - start_processing))
        _memory_usage_psutil()

        _upsert_channel_trends(channel_trends_caches)

    for channel in all_channels:
        days_to_purge = list(gen_timeslots(from_date, to_date, level='day'))
        top_topics = set([])
        for time_slot in days_to_purge:
            _get_top_topics(channel, time_slot, top_topics, 0)
        if not ignore_topics:
            _update_monthly_cht_values(channel, from_date, to_date, top_topics)

    logger.info(
        "Computed stats computations for account %s with post count %s in %s."
        % (account.name, post_count, datetime.now() - start_processing))
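
The batching arithmetic in the middle of the function is worth isolating: a ceiling division gives the number of batches, and each batch is a slice of the id list. A standalone sketch of the same pattern (the helper name is illustrative):

from math import ceil

def iter_batches(ids, batch_size):
    # Yield slices of at most batch_size ids, mirroring the sams_batches loop above.
    num_batches = int(ceil(len(ids) / float(batch_size)))
    for batch_idx in range(num_batches):
        yield ids[batch_idx * batch_size:(batch_idx + 1) * batch_size]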