Example #1
def compute_sam_match_query(channel):
    """ Compute the query which would match all speech acts for the given
    channels list, in the timeslot interval (from - to)"""
    from solariat.db import fields
    from solariat_bottle.utils.id_encoder import BIGGEST_POST_VALUE, TIMESLOT_WIDTH
    from solariat_bottle.db.speech_act import SpeechActMap, pack_speech_act_map_id

    from_timeslot = 0 << TIMESLOT_WIDTH       # smallest possible timeslot (0)
    to_timeslot = (1L << TIMESLOT_WIDTH) - 1  # largest possible timeslot

    to_binary = fields.BytesField().to_mongo
    match_query_base = []
    for status in SpeechActMap.STATUS_NAME_MAP.keys():
        # compute the id bounds covering every post with this status
        id_lower_bound = pack_speech_act_map_id(channel, status, from_timeslot,
                                                0)
        id_upper_bound = pack_speech_act_map_id(channel, status, to_timeslot,
                                                BIGGEST_POST_VALUE)
        match_query_base.append({
            '_id': {
                "$gte": to_binary(id_lower_bound),
                "$lte": to_binary(id_upper_bound)
            }
        })

    day_speech_act_filter = {"$or": match_query_base}
    return day_speech_act_filter
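
A minimal usage sketch (assumptions: `channel` is a Channel document accepted by pack_speech_act_map_id, and SpeechActMap exposes its raw pymongo collection as objects.coll, as Example #6 does for its own class):

match_query = compute_sam_match_query(channel)
# Every packed SpeechActMap _id for this channel, with any status and any
# timeslot, falls inside one of the $or ranges above.
for sam in SpeechActMap.objects.coll.find(match_query):
    print sam['_id']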
Example #2
    @classmethod  # implied by the `cls` argument; decorator assumed elided in the excerpt
    def make_id(cls, channel, status, time_slot):
        channel_num = channel.counter

        assert isinstance(channel_num, (int, long)), \
            'channel.counter must be an integer: %r' % channel_num  # noqa
        to_binary = fields.BytesField().to_mongo
        return to_binary(pack_short_stats_id(channel_num, status, time_slot))
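
A hedged call sketch (ShortStats is a hypothetical document class owning this method; status=0 and the timeslot value are illustrative):

binary_id = ShortStats.make_id(channel, status=0, time_slot=12345)
stats = ShortStats.objects.find_one(id=binary_id)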
Example #3
class StreamLog(Document):
    """Created on streamref creation, updated on stream stops"""
    accounts = fields.ListField(fields.ObjectIdField())
    channels = fields.ListField(fields.ObjectIdField())

    stream_ref_id = fields.BytesField()

    started_at = fields.DateTimeField(null=True)
    stopped_at = fields.DateTimeField(null=True)

    indexes = [('accounts', ), ('channels', ), ('stream_ref_id', )]
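
A rough usage sketch (assumes the Document base accepts field kwargs and exposes save(), as StreamRef.save() in Example #4 suggests; account, channel and ref are illustrative objects):

log = StreamLog(accounts=[account.id],
                channels=[channel.id],
                stream_ref_id=ref.id,
                started_at=now())
log.save()
# The (stream_ref_id,) index backs the reverse lookup:
log = StreamLog.objects.find_one(stream_ref_id=ref.id)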
Example #4
class StreamRef(Document):
    QUEUED = 'queued'
    RUNNING = 'running'
    ERROR = 'error'
    STOPPED = 'stopped'
    STREAM_STATUSES = [QUEUED, RUNNING, ERROR, STOPPED]

    id = fields.BytesField(db_field='_id', unique=True, required=True)
    track = fields.ListField(fields.StringField())
    follow = fields.ListField(fields.StringField())  # user_ids
    languages = fields.ListField(fields.StringField(), db_field='lng')

    status = fields.StringField(choices=STREAM_STATUSES)
    log = fields.ReferenceField('StreamLog')

    manager = StreamRefManager
    indexes = [('status', )]

    def is_stopped(self):
        return (self.status == self.STOPPED
                or (self.log and self.log.stopped_at is not None))

    @property
    def key(self):
        if not self.id:
            footprint = self.filters
            self.id = mhash(footprint, n=128)
        return self.id

    @property
    def filters(self):
        return (freeze(self.track),
                freeze(self.follow),
                freeze(self.languages))

    def set_added(self):
        self.update(status=self.RUNNING)
        self.log.update(started_at=now())

    def set_removed(self):
        self.update(status=self.STOPPED)
        self.log.update(stopped_at=now())

    def save(self, **kw):
        self.id = self.key  # fill hash id
        super(StreamRef, self).save(**kw)
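
A lifecycle sketch under the same assumptions (constructor with kwargs; a StreamLog is assumed to exist already, per its "created on streamref creation" docstring, since set_added/set_removed touch self.log):

ref = StreamRef(track=['solariat'], follow=['12345'], languages=['en'])
ref.save()         # save() derives the 128-bit mhash id from .filters
ref.log = StreamLog.objects.find_one(stream_ref_id=ref.id)
ref.set_added()    # status -> RUNNING, log.started_at stamped
ref.set_removed()  # status -> STOPPED, log.stopped_at stamped
assert ref.is_stopped()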
Example #5
from solariat.db import fields

from solariat_bottle.settings import LOGGER
from solariat_bottle.db.channel_trends import ChannelTrendsManager
from solariat_bottle.db.channel_stats_base import ChannelTrendsBase, EmbeddedStats
from solariat_bottle.utils.id_encoder import pack_conversation_stats_id, unpack_conversation_stats_id
from solariat_bottle.db.channel_stats_base import ALL_AGENTS, CountDict

to_binary = fields.BytesField().to_mongo


class ConversationEmbeddedStats(EmbeddedStats):

    # Metrics
    count = fields.NumField(db_field='cn', default=0)

    countable_keys = ['count']

    # def __hash__(self):
    #     return hash(self.agent)

    def __str__(self):
        return "|agent=%s;count=%s|" % (self.agent, self.count)


class ConversationTrends(ChannelTrendsBase):
    """ Base class for all conversation trends.
    has allow_inheritance set to True
    """
    manager = ChannelTrendsManager
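
For reference, each embedded-stats entry renders per agent via the __str__ above; a tiny illustration (kwarg construction assumed; ALL_AGENTS == 0 per Example #7):

es = ConversationEmbeddedStats(agent=ALL_AGENTS, count=3)
str(es)  # -> "|agent=0;count=3|"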
Example #6
# Imports assumed by the excerpt below (not part of the original snippet):
from time import sleep
from random import normalvariate

from pymongo.errors import DuplicateKeyError

from solariat.db import fields
from solariat.db.abstract import Document

# Other module-level names used below (unpack_stats_id, pack_components,
# get_channel_num, CHANNEL_WIDTH, TIMESLOT_WIDTH, to_python, AppException,
# LOGGER, DEFAULT_NEW_VERSION) come with the module's imports shown in
# Example #7, which appears to be the top of this same file.

class ChannelStatsBase(Document):
    """
    Base class for trend and topic stats.
    """
    version = fields.NumField(db_field='_v')

    id = fields.BytesField(db_field='_id', unique=True, required=True)
    time_slot = fields.NumField(default=0, db_field='ts')
    gc_counter = fields.NumField(db_field='g')
    channel_ts = fields.BytesField(db_field='ct')

    indexes = [('gc_counter', )]

    def channel_ts_from_id(self, data_id):
        """ From a document id compute a channel ts """
        channel_num, _, _, time_slot = unpack_stats_id(to_python(data_id))
        return self.make_channel_ts(channel=channel_num, time_slot=time_slot)

    @classmethod
    def make_channel_ts(cls, channel, time_slot):
        channel_num = get_channel_num(channel)
        res = pack_components(
            (channel_num, CHANNEL_WIDTH),
            (time_slot, TIMESLOT_WIDTH),
        )
        return res

    @property
    def EmbeddedStatsCls(self):
        return self.fields['embedded_stats'].field.doc_class

    @property
    def _query(self):
        raise AppException(
            'unimplemented method, to be overridden in a subclass')

    def prepare_update_query(self, item_id, item_topic):
        """
        Genereate the update query for all the embedded stats. Also return
        the item version so that we can do optimistic locking on stats update.
        This is needed because we are setting new embedded stats every time.
        """
        item = self.objects.find_one(id=item_id)

        if item:
            # In case of a hash collision, this assert stops us from retrying
            # endlessly. item_topic can be None for simple trends.
            if item_topic is not None:
                assert item.topic == item_topic, u"Collision '%s' '%s'" % (
                    item.topic, item_topic)
            version = item.version if item.version else DEFAULT_NEW_VERSION
            existing_embedded_stats = item.embedded_stats
        else:
            version = DEFAULT_NEW_VERSION
            existing_embedded_stats = []

        new_embedded_stats = self.compute_new_embeded_stats()
        # Generate an updated list based on the in-memory entries and
        # the existing entries from the database.
        updated_list = self.EmbeddedStatsCls.update_list(
            existing_embedded_stats, new_embedded_stats)
        self._upsert_data = {
            "$set": {
                self.name2db_field('embedded_stats'): updated_list
            }
        }
        return version

    def upsert(self, w=1):
        # Try 5 times to make it robust against write conflicts.
        return bool(self.stats_upsert(max_tries=5))

    def get_expected_topic(self, query):
        """ In the simple reports we expect no topic. """
        return None

    def stats_upsert(self, max_tries=4, logger=LOGGER):
        """Used in upsert() method for documents with embedded stats list.

        Returns True if document has been successfully saved within `max_tries` iterations,
        else False.
        """
        _v = self.name2db_field('version')
        find_query = self._query

        # drop 'gc_counter' from the query if present
        find_query.pop(self.name2db_field('gc_counter'), None)
        # simple trends do not have topics
        item_topic = self.get_expected_topic(find_query)
        item_id = find_query["_id"]

        nr_of_tries = 0
        while nr_of_tries < max_tries:
            nr_of_tries += 1
            try:
                version = self.prepare_update_query(item_id, item_topic)
            except AssertionError as e:
                logger.warning(
                    u"Topic hashing collision. Stats not updated!\nfind query=%s\nitem topic=%s\n%s",
                    find_query, item_topic, e)
                return False

            # Increment version using $set to be more robust to new documents
            self._upsert_data["$set"][_v] = version + 1
            if version > DEFAULT_NEW_VERSION:
                # If it's an update, just look by id and version, nothing else really matters
                find_query = {_v: version, "_id": find_query["_id"]}
            else:
                # On new documents, set the default version and use whole find query so upsert
                # generates a document with whole data
                find_query[_v] = version

            try:
                assert '_id' in find_query, 'unique id required'
                assert '_v' in find_query, 'version required'
                self.objects.coll.update(find_query,
                                         self._upsert_data,
                                         upsert=True,
                                         w=1)
                return True
            except AssertionError:
                logger.error(
                    u"Find query needs at the very least _id and _v. Instead got: %s",
                    find_query)
                return False
            except DuplicateKeyError:
                # This is just part of our optimistic lock and can fail a lot especially for high
                # traffic channels, so we should not consider it an error since it just makes actual
                # error tracking in logs harder to do.
                if 2 <= nr_of_tries <= 3:
                    # We already tried 2 or 3 times, it might be an actual problem
                    LOGGER.warning(
                        "channel stats locking: collision %s times in a row. id=%r",
                        nr_of_tries, find_query['_id'])
                elif nr_of_tries >= 4:
                    # We already tried 4 times, something is definitely wrong
                    LOGGER.error(
                        u"channel stats locking: collision repeated %s times. Find query=%s, Upsert=%s",
                        nr_of_tries, find_query, self._upsert_data)
                # If we just had an optimistic lock fail, sleep for a random period
                # until trying again.
                delay_sec = max(0.01, normalvariate(0.1, 0.03))
                LOGGER.debug(
                    'channel stats locking: waiting for %.2f sec after a collision',
                    delay_sec)
                sleep(delay_sec)
        # All retries exhausted: report failure, as the docstring promises.
        return False
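
The version check above is standard optimistic locking; a stripped-down sketch of the same mechanism with raw pymongo (hypothetical collection and field names):

def optimistic_upsert(coll, doc_id, stats, max_tries=4):
    from pymongo.errors import DuplicateKeyError
    for _ in range(max_tries):
        current = coll.find_one({'_id': doc_id})
        version = current['_v'] if current else 0
        query = {'_id': doc_id, '_v': version}
        update = {'$set': {'stats': stats, '_v': version + 1}}
        try:
            # Matches only if nobody bumped _v since our read; a lost race
            # makes the upsert insert a duplicate _id and raises instead.
            coll.update(query, update, upsert=True)
            return True
        except DuplicateKeyError:
            continue  # another writer won; re-read and retry
    return False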
Example #7
from solariat.db import fields
from solariat.db.abstract import SonDocument, Document
from solariat.utils.lang.support import Lang, get_lang_code

from solariat_nlp.sa_labels import ALL_INTENTIONS, SATYPE_ID_TO_NAME_MAP
from solariat_bottle.settings import LOGGER, AppException, get_var
from solariat_bottle.db.channel.base import Channel
from solariat_bottle.utils.id_encoder import (pack_stats_id, unpack_stats_id,
                                              pack_components, get_channel_num,
                                              CHANNEL_WIDTH, TIMESLOT_WIDTH)

ALL_AGENTS = 0
ANONYMOUS_AGENT_ID = -1
DEFAULT_NEW_VERSION = 0
ALL_INTENTIONS_INT = int(ALL_INTENTIONS.oid)

to_python = fields.BytesField().to_python
to_mongo = fields.BytesField().to_mongo
to_binary = to_mongo


def conversation_closed(conversation, closing_time, quality):
    from solariat_bottle.tasks.stats import update_conversation_stats
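    # In tests, run the stats update synchronously; otherwise hand it off to
    # the task machinery (.ignore() presumably enqueues it fire-and-forget).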
    if get_var('ON_TEST'):
        update_conversation_stats(conversation, closing_time, quality)
    else:
        update_conversation_stats.ignore(conversation, closing_time, quality)


def post_created(post, **context):
    # Avoid circular imports. TODO: We should have a clearer package dep chain.
    from solariat_bottle.utils.stats import _update_channel_stats
Example #8
class DataExport(ArchivingAuthDocument):

    STATES = dict(CREATED=0,
                  FETCHING=1,
                  FETCHED=2,
                  GENERATING=3,
                  GENERATED=4,
                  SENDING=5,
                  SENT=6,
                  SUCCESS=7,
                  ERROR=8,
                  CANCELLED=9)
    State = enum(**STATES)

    account = fields.ReferenceField('Account', db_field='at')
    created_by = fields.ReferenceField('User', db_field='ur')
    recipients = fields.ListField(fields.ReferenceField('User'), db_field='rs')
    recipient_emails = fields.ListField(fields.StringField(), db_field='rse')
    state = fields.NumField(choices=STATES.values(),
                            default=State.CREATED,
                            db_field='se')
    created_at = fields.DateTimeField(db_field='ct', default=now)
    _input_filter = fields.DictField(db_field='ir')
    input_filter_hash = fields.BytesField(db_field='irh')
    states_log = fields.ListField(fields.DictField(), db_field='sg')

    indexes = [('acl', 'input_filter_hash')]
    manager = DataExportManager

    def set_input_filter(self, data):
        self._input_filter = data
        self.input_filter_hash = hash_dict(data)

    input_filter = property(lambda self: self._input_filter, set_input_filter)

    def _log_state_change(self, from_state, to_state, extra_info):
        doc = {"from": from_state, "to": to_state, "ts": now()}
        if extra_info:
            doc["info"] = extra_info
        self.states_log.append(doc)
        return {"push__states_log": doc}

    def change_state(self, new_state, **kwargs):
        current_state = self.state
        assert \
            new_state in {self.State.ERROR, self.State.CANCELLED} \
            or new_state - current_state <= 2, \
            "Cannot switch to state %s from state %s" % (
                new_state, current_state)

        self.state = new_state
        update_dict = self._log_state_change(current_state, new_state, kwargs)
        update_dict.update(set__state=new_state)
        self.update(**update_dict)

    def to_json(self, fields_to_show=None):
        data = super(DataExport,
                     self).to_json(fields_to_show=('id', 'input_filter_hash',
                                                   'state', 'created_at'))
        data['input_filter_hash'] = str(data['input_filter_hash'])
        return data

    def process(self, user, params=None):
        state = self.change_state
        S = DataExport.State
        initial_args = user, params

        pipeline = [(S.FETCHING, fetch_posts),
                    (S.GENERATING, PostsCsvGenerator.generate_csv),
                    (None, create_zip_attachments),
                    (S.SENDING, DataExportMailer(self).send_email)]

        try:
            args = initial_args
            for step, command in pipeline:
                if step:
                    state(step)
                result = command(*args)
                if not isinstance(result, tuple):
                    args = (result, )
                else:
                    args = result

            state(S.SUCCESS)
        except Exception as exc:
            state(S.ERROR, exception=unicode(exc))
            raise  # re-raise, preserving the original traceback
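
The change_state assertion permits forward jumps of at most two states, while ERROR and CANCELLED stay reachable from anywhere. An illustrative sketch (export is an existing DataExport instance; State values per STATES above):

S = DataExport.State
export.change_state(S.FETCHING)    # CREATED(0) -> FETCHING(1): ok
export.change_state(S.GENERATING)  # 1 -> 3, a jump of two: still ok
export.change_state(S.CANCELLED)   # terminal states are always reachable
# change_state(S.SENT) straight from CREATED would jump six states
# forward and trip the assertion.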