Example #1
        def to_dict(self, include_cardinalities=False, fields2show=None, **kw):
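            # Build the base dict via super(), add is_locked, and either drop
            # cardinalities or render timestamp cardinality values as
            # 'MM/DD/YYYY HH:MM:SS' strings.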
            res = super(SchemaBased, self).to_dict(fields2show)
            res['is_locked'] = self.is_locked

            if not include_cardinalities:
                res.pop('cardinalities', None)
                return res

            cardinalities = res['cardinalities'] or {}
            for key, val in cardinalities.iteritems():
                if key not in self.schema_field_types:
                    continue
                if self.schema_field_types[key] != TYPE_TIMESTAMP:
                    continue
                if VALUES not in val:
                    continue

                LOGGER.info('Refreshing cardinalities for %s', str(val))
                val[VALUES] = [
                    dt.strftime('%m/%d/%Y %H:%M:%S') if isinstance(
                        dt, datetime.date) else dt
                    for dt in val.get(VALUES, [])
                ]

            return res
Example #2
def purge_outdated_trends_stats(coll, channel, level, delta):
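    # Walk backwards in time from (now - delta), one timeslot per iteration,
    # removing trend records for each slot; stop once enough empty slots
    # (EMPTY_SLOTS_NUMBER) have been seen and return the total removed.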
    initial_timedelta_arg_name = {"hour": "days", "day": "months"}[level]
    timedelta_arg_name = {"hour": "hours", "day": "days"}[level]
    start_dt = now() - relativedelta(**{initial_timedelta_arg_name: delta})
    current_dt = start_dt
    time_step = relativedelta(**{timedelta_arg_name: 1})
    ts = datetime_to_timeslot(current_dt, level)
    zero_counts = 0
    total_records_removed = 0
    EMPTY_SLOTS_NUMBER = 10
    while zero_counts <= EMPTY_SLOTS_NUMBER:
        t0 = datetime.now()
        channel_ts_val = ChannelTopicTrends.make_channel_ts(channel, ts)  # NOTE: computed but never used
        res = coll.objects.coll.remove(coll.objects.get_query(time_slot=ts))
        if res['n'] == 0:
            zero_counts += 1
        current_dt = current_dt - time_step
        total_records_removed += res['n']
        ts = datetime_to_timeslot(current_dt, level)
        LOGGER.info(
            "purging Q:: collection: %s; func: %s; timedelta: %s; date: %s; level: %s; records removed: %s",
            coll.__name__,
            inspect.stack()[0][3],
            datetime.now() - t0, current_dt, level, res['n'])
    return total_records_removed
Example #3
    def run(self):
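        # Consume tasks from the input queue until stopped or a QUIT sentinel
        # arrives; each task is preprocessed and only turned into a post if it
        # can be assigned to at least one channel.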
        inp_queue = self.inp_queue
        start_time = time.time()

        while not self.stopped():
            # make sure we intercept all errors
            try:
                task = inp_queue.get()
                if task is self.QUIT or task == 'QUIT':
                    LOGGER.debug('received QUIT signal %s' % self)
                    break
                start_time = time.time()
                self._busy = True  # Just started doing our post processing
                post_fields = self.preprocess_post(task)
                if not post_fields:
                    LOGGER.warning('no post_fields in: %s', task)
                    continue

                # LOGGER.debug('creating post %r %s', post_fields.get('content'), inp_queue.qsize())

                if self.assign_channels(post_fields):
                    self.create_post(**post_fields)
                else:
                    LOGGER.info('skipping post %r' %
                                post_fields.get('content'))
                    self.inc_skipped()

                self._busy = False  # Just Finished doing our post processing
            except Exception, err:
                LOGGER.error(err, exc_info=True)

            finally:
                pass  # the finally block body is truncated in the source listing
Example #4
    def run(self):
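        # Reconnection loop: rebuild the Datasift client (or a test client when
        # ON_TEST is set), wire it to the checker and subscriber, run it, and
        # retry after a short sleep on any unexpected error.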
        self.ds_client = None

        while not self.stopped():
            try:
                del self.ds_client  # to garbage-collect the old client ASAP
                self._running = False

                if not get_var('ON_TEST'):
                    self.ds_client = DatasiftClient(ds_login=self.ds_login,
                                                    ds_api_key=self.ds_api_key,
                                                    bot_instance=self,
                                                    sanity_checker=self.checker)
                else:
                    self.ds_client = TestDatasiftClient(bot_instance=self)

                self.ds_client.connect()
                self._running = True

                LOGGER.info('connected to %s', self.ds_client.WEBSOCKET_BASE_URL)

                self.checker.set_client(self.ds_client)
                self.ds_subscriber.set_client(self.ds_client)

                self.ds_client.run()  # receives posts from Datasift
            except Exception as e:
                LOGGER.error(e, exc_info=True)
                sleep(5)  # wait a bit on any unexpected error
Example #5
    def build_regression_pipe(self, pipe):
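        # Build a two-stage Mongo aggregation pipeline: a $project of the
        # selected context/action features plus the reward, followed by a
        # single $group that pushes per-feature {value, reward} pairs.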
        F = self.train_class.F
        reward = '$' + F.reward
        context = '$' + F.context
        # context_vector = '$' + F.context_vector
        # action_vector = '$' + F.action_vector
        action = '$' + F.action

        context_f = []
        action_f = []
        for feature in self.FEATURES:
            if 'context' in feature:
                context_f.append(feature.replace('context:', ''))
            elif 'action' in feature:
                action_f.append(feature.replace('action:', ''))

        project_regr = {'$project': {'reward': '$reward', '_id': '$_id'}}
        for _feature in context_f:
            project_regr['$project'].update(
                {'ctx_' + _feature: '$ctx.' + _feature})
        for _feature in action_f:
            project_regr['$project'].update(
                {'act_' + _feature: '$act.' + _feature})

        pipe.append(project_regr)

        # No grouping -- only aggregating features data
        group_regr = {'$group': {'_id': {}, 'count': {'$sum': 1}}}

        for _feature in context_f:
            group_regr['$group'].update({
                'ctx_' + _feature: {
                    '$push': {
                        # "key": action_vector + '.' + _feature,
                        "value": context + '_' + _feature,
                        "reward": reward
                    }
                }
            })

        for _feature in action_f:
            group_regr['$group'].update({
                'act_' + _feature: {
                    '$push': {
                        # "key":  context_vector + '.' + _feature,
                        "value": action + '_' + _feature,
                        "reward": reward
                    }
                }
            })

        pipe.append(group_regr)
        LOGGER.info("Executing aggregation query: " + str(pipe))
        return pipe
Example #6
 def acquire_for_stream(self, ref):
     LOGGER.info(u"Acquiring auth for stream %s" % ref)
     with self._lock:
         auth = self._resource.get()
         self._in_use[ref.key] = auth
         LOGGER.debug(u"In use: {}".format(self._in_use))
     return auth
Example #7
def postprocess_events(user):
    from solariat_bottle.db.user import set_user

    set_user(user)
    account = user.account

    start = time.time()
    try:
        _postprocess_events(account)

        # TODO: to remove
        # [11/11/16, 5:11:01 PM] Bogdan Neacsa: Hey Vlad, the way the architecture is going to work this is a scheduled task
        # [11/11/16, 5:11:10 PM] Bogdan Neacsa: So it will just restart automatically on next iteration
        # stop = False
        # while not stop:
        #     _postprocess_events(account)
        #     account.reload()
        #     if account.event_processing_needs_restart:
        #         account.update(event_processing_needs_restart=False)
        #         continue
        #     stop = True
    except:
        LOGGER.critical('[DynamicEvents Postprocessing] Cannot process events:', exc_info=True)
    finally:
        account.update(event_processing_lock=False, event_processing_needs_restart=False)

    LOGGER.info('[DynamicEvents Postprocessing] took: %s sec', time.time() - start)
Example #8
def log_staff_request(user):
    if user and (user.is_superuser or user.is_staff) \
            and request.path.startswith('/configure'):
        from solariat.utils.logger import format_request
        from solariat_bottle.settings import LOGGER

        LOGGER.info("\n" + format_request(request, user))
Example #9
    def post_authenticated(self, url, json=None, number_of_retries=None):
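        # POST to an auth-protected endpoint, refreshing the auth token when it
        # expires or the request is unauthorized, and retrying (optionally a
        # limited number of times) on infrastructure errors.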
        assert self.options and self.options.username and self.options.password

        authtoken = None
        expired = None

        while True:
            if not authtoken:
                authtoken = self.get_authtoken(expired)
                expired = None
            auth_url = self.apply_token(url, json, authtoken)
            try:
                return self.post(auth_url, json=json)
            except ApplicationError as err:
                if str(err) == 'Auth token %s is expired' % authtoken:
                    LOGGER.info(err)
                    expired = authtoken
                    authtoken = None
                else:
                    LOGGER.exception(err)
                    break
            except UnauthorizedRequestError as err:
                LOGGER.warning(err, exc_info=True)
                expired = authtoken
                authtoken = None
            except InfrastructureError as err:
                LOGGER.exception(err)
                if number_of_retries is None:
                    time.sleep(self.sleep_timeout)
                elif isinstance(number_of_retries,
                                int) and number_of_retries > 0:
                    number_of_retries -= 1
                else:
                    break
Example #10
def purge_channel_stats(channel):
    days = get_var('CHANNEL_STATS_KEEP_DAYS')

    start_date = datetime(year=2012, month=1, day=1)
    end_date = now() - timedelta(days=days)
    # end_date   = datetime(year=end_date.year, month=end_date.month, day=1)
    timeslots = (
        (datetime_to_timeslot(start_date, level), datetime_to_timeslot(end_date, level))
        for level in TIMESLOT_LEVEL_NAMES
    )

    F = ChannelStats.F
    removed_count = 0
    for start_ts, end_ts in timeslots:
        t0 = datetime.now()
        res = ChannelStats.objects.coll.remove({
            F('time_slot'): {
                '$lte': end_ts,
                '$gt': start_ts
            },
            F('channel'): channel.id
        })
        LOGGER.info(
            "purging Q:: channel: %s; collection: ChannelStats; func: %s; timedelta: %s"
            % (channel.title, inspect.stack()[0][3], datetime.now() - t0))
        removed_count += res['n']
    return removed_count
Example #11
    def run(self):
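        # Forward queued post data to the /api/v1.2/posts endpoint, refreshing
        # the auth token whenever the server reports it as expired.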

        post_data = None
        authtoken = None
        expired_authtoken = None

        while True:
            if not authtoken:
                authtoken = self.get_authtoken(expired_authtoken)
            if not post_data:
                post_data = self.task_queue.get()

            url = '%s/api/v1.2/posts?token=%s' % (self.options.url, authtoken)
            headers = {'Content-Type': 'application/json'}

            try:
                self.handle_connection(url, 'POST', post_data, headers)
            except ApplicationError as err:
                if str(err) == 'Auth token %s is expired' % authtoken:
                    LOGGER.info(err)
                    expired_authtoken = authtoken
                    authtoken = None
                else:
                    LOGGER.error(err)
                    post_data = None
                    self.task_queue.task_done()
            except InfrastructureError as err:
                LOGGER.error(err, exc_info=True)
                gevent.sleep(self.sleep_timeout)
            else:
                post_data = None
                self.task_queue.task_done()
Example #12
def trends_remove(counter):
    t0 = datetime.now()
    res = ChannelTopicTrends.objects.coll.remove({FT('gc_counter'): counter})
    LOGGER.info(
        "purging Q:: collection: ChannelTopicTrends; func: %s; timedelta: %s" %
        (inspect.stack()[0][3], datetime.now() - t0))
    return res
Example #13
    def preprocess_post(self, event_json):
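        # Dispatch a (message_type, data) tuple to the matching preprocessor;
        # dicts are assumed to be already-processed post data and are returned
        # unchanged.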
        if isinstance(event_json, (tuple, list)):
            message_type, data = event_json
            post_data = None
            preprocess = self.preprocessors.get(message_type)
            if preprocess is None:
                LOGGER.warn(u"Unknown message type: %s\nEvent is: %s" %
                            (message_type, event_json))
                return None

            try:
                post_data = preprocess(data)
            except:
                import traceback
                traceback.print_exc()
                LOGGER.warn(u"Error parsing tweet: %s" % unicode(event_json))

            if post_data:
                return post_data
            else:
                LOGGER.info(u"Twitter event: %s" % unicode(event_json))

        elif isinstance(event_json, dict):
            # already processed
            return event_json
        return None
Example #14
def _postprocess_events(account):
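    # Re-run journey processing for all of the account's dynamic events: wipe
    # existing CustomerJourney docs, mark the channels' events as unprocessed,
    # then re-process them in fixed-size batches while reporting progress.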
    assert account.event_processing_lock
    from solariat_bottle.db.events.event import Event
    from solariat_bottle.db.journeys.customer_journey import CustomerJourney

    LOGGER.info('[DynamicEvents Postprocessing] Start re-processing '
                'ALL dynamic events because some of schemas were changed.')

    # Reset customer journey data
    CustomerJourney.objects.remove(account_id=account.id)
    # TODO: After account specific collection is done this should work just fine / uncomment
    # Event.objects.coll.update({'_id': {'$ne': 1}}, {'$set': {'_wp': False}}, multi=True)
    channels = account.get_current_channels()
    Event.objects.coll.update(
        {'_id': {'$ne': 1}, 'cs': {'$in': [c.id for c in channels]}},
        {'$set': {'_wp': False}},
        multi=True)

    from solariat_bottle.tasks.journeys import process_event_batch
    batch_size = 2000
    total_count = Event.objects.count()
    n_batches = total_count / batch_size + 1
    progress = 0
    for batch_nr in xrange(n_batches):
        process_event_batch(account.id, batch_size)
        progress += 100.0 / n_batches
        account.update(resync_progress=progress)
Example #15
    def get_status(self):
        """ :return: The status of the current runner, based on the equivalent datasift status. """
        self.subscription.reload()
        if self.subscription.status in {
                SUBSCRIPTION_STOPPED, SUBSCRIPTION_ERROR
        }:
            return self.subscription.status

        status_data = self.get_subscription_status()
        LOGGER.info(status_data)
        if status_data is None or status_data is False:
            return SUBSCRIPTION_PENDING

        if status_data['status'] in ('init', 'queued', 'running'):
            chunks = [
                'status:' + str(dt['status']) + ', progress:' +
                str(dt['progress']) for dt in status_data.get('chunks', [])
            ]
            LOGGER.info("Query status is %s. Data chunks status are (%s)." %
                        (status_data['status'], chunks))

            self.subscription.update(status_data_historics=status_data,
                                     status=SUBSCRIPTION_RUNNING)
            return SUBSCRIPTION_RUNNING

        self.subscription.update(status_data_historics=status_data,
                                 status=SUBSCRIPTION_FINISHED)
        return SUBSCRIPTION_FINISHED
Example #16
 def create_new_user(cls):
     data = dict(permissions=['publish_actions', 'read_stream'],
                 installed=True,
                 access_token="%s|%s" % (cls.app_id, cls.app_secret))
     user = requests.post("https://graph.facebook.com/v2.2/%s/accounts/test-users" % cls.app_id,
                          data=data).json()
     LOGGER.info("Created user " + str(user))
     return user
Example #17
    def get_stage_path_class(self, journey):
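        # Compare the analysis metric values (step/stage pairs) against the
        # journeys' stage sequences and return the index of the first metric
        # value that matches; fall back to IDX_SKIP when nothing matches.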
        import json
        from solariat_bottle.db.journeys.journey_type import JourneyStageType

        metric_values = [
            json.loads(metric_value)
            for metric_value in self.analysis.metric_values
        ]
        # Load up actual stages
        journey_sequences = []
        for entry in metric_values:
            try:
                entry['stage'] = JourneyStageType.objects.get(
                    display_name=entry['stage']).display_name
                journey_sequence = journey.stage_sequence_names
                journey_sequences.append(journey_sequence)
            except JourneyStageType.DoesNotExist:
                # It's a strategy stage, need more specific aggregation for this
                from solariat_bottle.db.journeys.customer_journey import EVENT_STRATEGY, PLATFORM_STRATEGY

                query = {CustomerJourney.F.id: journey.id}
                for strategy in {EVENT_STRATEGY, PLATFORM_STRATEGY}:
                    # query[StrategyLabelInformation.F.strategy] = strategy
                    # pipeline = [
                    #     {'$match': query},
                    #     {'$group':
                    #             {
                    #                 '_id': {"journey_id": '$' + StrategyLabelInformation.F.customer_journey_id},
                    #                 'stage_sequence': {"$max": '$' + StrategyLabelInformation.F.stage_sequence_names},
                    #             }
                    #         }
                    # ]
                    # agg_results = StrategyLabelInformation.objects.coll.aggregate(pipeline)['result'][0]
                    # NOTE: agg_results comes from the aggregation that is
                    # commented out above; as written this raises a NameError.
                    journey_sequence = agg_results['stage_sequence']
                    journey_sequences.append(journey_sequence)
                break

        for class_idx, metric_value in enumerate(metric_values):
            step = metric_value['step']
            stage = metric_value['stage']
            for journey_sequence in journey_sequences:
                if len(journey_sequence) <= step:
                    LOGGER.info(
                        "Skipped sequence %s because shorter than step %s" %
                        (journey_sequence, step))
                    continue

                stage_at_step = journey_sequence[step]
                if stage_at_step != stage:
                    LOGGER.info(
                        "Skipped sequence %s because found stage %s at step %s instead of %s"
                        % (journey_sequence, stage_at_step, step, stage))
                    continue
                else:
                    return class_idx

        return self.analysis.IDX_SKIP  # Doesn't even matter
Example #18
 def release_for_stream(self, ref):
     LOGGER.info(u"Releasing auth for stream %s" % ref)
     with self._lock:
         if ref.key in self._in_use:
             auth = self._in_use.pop(ref.key)
             self.put(auth)
         else:
             auth = None
         LOGGER.debug(u"In use: {}".format(self._in_use))
     return auth
Example #19
def run_or_restart_postprocessing(user, msg):
    account = user.account
    account.reload()

    if account.event_processing_lock:
        account.update(event_processing_needs_restart=True)
        LOGGER.info(msg)
    else:
        account.update(event_processing_lock=True)
        postprocess_events.async(user)
Example #20
 def add(self, job):
     with self.lock:
         if job.topic not in self.config.supported_topics:
             raise RegistryError('Job: %s topic "%s" is not supported' %
                                 (job.name, job.topic))
         if job.name in self.registry:
             raise RegistryError('Job: %s already exists in registry' %
                                 job.name)
         self.registry[job.name] = job
         LOGGER.info('Job with name "%s" added', job.name)
Example #21
 def get(self, name):
     with self.lock:
         if name not in self.registry:
             LOGGER.info('No Job registered for: %s, trying to import.',
                         name)
             try:
                 self._import_module(name)
             except ImportError:
                 LOGGER.error('Cannot import job module:', exc_info=True)
                 raise RegistryError('No Job registered for: %s' % name)
         return self.registry[name]
Example #22
def mark_items_to_keep_query(doc_ids):
    t0 = datetime.now()
    update = ChannelHotTopics.objects.coll.update(
        {'_id': {'$in': doc_ids}},
        {'$set': {F('gc_counter'): MARKED_TO_KEEP}},
        multi=True)
    LOGGER.info(
        "purging Q:: collection: ChannelHotTopics; func: %s; timedelta: %s" %
        (inspect.stack()[0][3], datetime.now() - t0))
    return update
Example #23
 def _add_feed_thread(self):
     qsize = self.feed_queue.qsize()
     total_threads = len(self.feed_api_threads)
     if (total_threads == 0
             or qsize > 1) and total_threads < self.max_workers:
         thread = FeedApiThread(
             args=(self.feed_queue, self.options),
             kwargs={'User-Agent': '%s-%s' % (self.user_agent, self._num)})
         thread.daemon = True
         thread.start()
         self.feed_api_threads.append(thread)
         LOGGER.info("Added FeedApiThread")
Example #24
    def run(self):
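        # pycurl variant of the post-forwarding loop: resolve channels for raw
        # Twitter dicts via handle_post, serialize to JSON, then POST it and
        # refresh the auth token whenever it expires.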

        post_data = None
        authtoken = None
        expired_authtoken = None

        while True:
            if not authtoken:
                authtoken = self.get_authtoken(expired_authtoken)
            if not post_data:
                post_data = self.task_queue.get()
            # This is used both by datasift and by twitter_bot_dm.
            # Just to be safe, in case we receive a dict with no 'channels' key,
            # do the processing here (as is the case with twitter_bot),
            # otherwise assume it was done before (as is the case with datasift_bot).
            if isinstance(post_data, dict) and 'channels' not in post_data:
                channels = handle_post('Twitter', post_data['user_profile'],
                                       post_data)
                if channels:
                    channels = [str(c.id) for c in channels]
                    post_data['channels'] = channels

                # we need this for getting channels only
                if 'direct_message' in post_data:
                    del post_data['direct_message']

                post_data = json.dumps(post_data)

            self.reset_buff()
            self.conn.setopt(pycurl.POSTFIELDS, post_data)
            self.conn.setopt(
                pycurl.URL,
                '%s/api/v1.2/posts?token=%s' % (self.options.url, authtoken))
            self.conn.setopt(pycurl.HTTPHEADER,
                             ['Content-Type: application/json'])

            try:
                self.handle_connection()
            except ApplicationError as err:
                if str(err) == 'Auth token %s is expired' % authtoken:
                    LOGGER.info(err)
                    expired_authtoken = authtoken
                    authtoken = None
                else:
                    LOGGER.error(err)
                    post_data = None
                    self.task_queue.task_done()
            except InfrastructureError as err:
                LOGGER.error(err)
                time.sleep(self.sleep_timeout)
            else:
                post_data = None
                self.task_queue.task_done()
Example #25
    def _handle_tracking(self, action, pages=None, events=None):
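        # Expand 'all' into the channel's tracked page/event id lists and
        # forward the action to FacebookTracking for each non-empty list.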
        LOGGER.info(u"Invoked {}[{}]._handle_tracking action={} pages={} events={}".format(
            self.__class__.__name__, self.id, action, pages, events))

        if pages == 'all':
            pages = self.facebook_page_ids
        if events == 'all':
            events = self.tracked_fb_event_ids
        if pages:
            FacebookTracking.objects.handle_channel_event(action, self, pages, PAGE)
        if events:
            FacebookTracking.objects.handle_channel_event(action, self, events, EVENT)
Example #26
def reset_db():
    # Run this once so that collections are totally reset and indexes applied
    setup_db_connection({"DB_NAME": TEST_DB, "TEST_DB_NAME": TEST_DB})
    db = get_connection()

    for coll_name in db.collection_names():
        if coll_name != RPC_COLLECTION and not coll_name.startswith('system.'):
            coll = db[coll_name]
            coll.drop()

    LOGGER.info("Creating indexes...")
    indexctl.put_indexes([], True)
Example #27
def get_insight(user, analyzer_id):
    if not analyzer_id:
        return jsonify(ok=True, error="missing parameter analyzer_id")
    if request.method == 'DELETE':
        removed = InsightsAnalysis.objects.remove(id=analyzer_id)
        LOGGER.info("Removing analysis finished successfully: " + str(removed))
        return jsonify(ok=bool(removed['ok']), message="Successfully removed %s analysis." % removed['n'])
    else:
        try:
            return jsonify(ok=True, item=InsightsAnalysis.objects.get(analyzer_id).to_dict())
        except InsightsAnalysis.DoesNotExist, ex:
            return jsonify(ok=False, error="No Analysis found with id = %s" % analyzer_id)
Example #28
    def create(self, user, data_loader):
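        # Discover the schema via the data loader, store it on a freshly
        # created schema entity, and kick off the asynchronous data load.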
        discovered_schema = data_loader.read_schema()
        assert isinstance(data_loader, SchemaBasedDataLoader)
        schema_entity = self.profile_cls.create(self.parent.id)
        schema_entity.add_perm(user)
        start = time.time()
        schema_entity.update(discovered_schema=discovered_schema)
        LOGGER.info('Analyzing input data took: %s', time.time() - start)

        finish_data_load.async(user, schema_entity, data_loader)
        # finish_data_load(user, schema_entity, data_loader)
        return schema_entity
Example #29
def tw_process_historic_subscription(subscription):
    from solariat_bottle.daemons.twitter.historics.subscriber import TwitterHistoricsSubscriber
    from solariat_bottle.settings import LOGGER
    from datetime import datetime

    start_time = datetime.now()
    subscriber = TwitterHistoricsSubscriber(subscription)

    LOGGER.info("Subscription %s started at %s." %
                (subscription.id, start_time))
    subscriber.start_historic_load()
    LOGGER.info("Subscription %s finished. Elapsed time %s" %
                (subscription.id, datetime.now() - start_time))
Example #30
 def save_local_models(self):
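     # Persist every cached LocalModel and record its id in clf_map under a
     # string key (float keys are coerced to str(int(key))).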
     if hasattr(self, '_clf'):
         start_ts = dt.now()
         for key, local_model in self.clf._model_cache.items():
             local_model.save()
             # hack for float keys
             # because keys in model.clf_map should be strings
             # Alex Gogolev
             if isinstance(key, float):
                 key = str(int(key))
             self.clf_map[str(key)] = local_model.id
         LOGGER.info("Saved %s LocalModel-s, timedelta: %s",
                     len(self.clf._model_cache),
                     dt.now() - start_ts)