def to_dict(self, include_cardinalities=False, fields2show=None, **kw):
    res = super(SchemaBased, self).to_dict(fields2show)
    res['is_locked'] = self.is_locked
    if not include_cardinalities:
        res.pop('cardinalities', None)
        return res

    cardinalities = res['cardinalities'] or {}
    for key, val in cardinalities.iteritems():
        if key not in self.schema_field_types:
            continue
        if self.schema_field_types[key] != TYPE_TIMESTAMP:
            continue
        if VALUES not in val:
            continue
        LOGGER.info('Refreshing cardinalities for %s', str(val))
        val[VALUES] = [
            dt.strftime('%m/%d/%Y %H:%M:%S')
            if isinstance(dt, datetime.date) else dt
            for dt in val.get(VALUES, [])
        ]
    return res
def purge_outdated_trends_stats(coll, channel, level, delta):
    initial_timedelta_arg_name = {"hour": "days", "day": "months"}[level]
    timedelta_arg_name = {"hour": "hours", "day": "days"}[level]
    start_dt = now() - relativedelta(**{initial_timedelta_arg_name: delta})
    current_dt = start_dt
    time_step = relativedelta(**{timedelta_arg_name: 1})
    ts = datetime_to_timeslot(current_dt, level)
    zero_counts = 0
    total_records_removed = 0
    EMPTY_SLOTS_NUMBER = 10

    while zero_counts <= EMPTY_SLOTS_NUMBER:
        t0 = datetime.now()
        channel_ts_val = ChannelTopicTrends.make_channel_ts(channel, ts)  # note: computed but unused here
        res = coll.objects.coll.remove(coll.objects.get_query(time_slot=ts))
        if res['n'] == 0:
            zero_counts += 1
        current_dt = current_dt - time_step
        total_records_removed += res['n']
        ts = datetime_to_timeslot(current_dt, level)
        LOGGER.info(
            "purging Q:: collection: %s; func: %s; timedelta: %s; "
            "date: %s; level: %s; records removed: %s",
            coll.__name__, inspect.stack()[0][3], datetime.now() - t0,
            current_dt, level, res['n'])
    return total_records_removed
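# Minimal standalone sketch (assumed, not from the source) of the walk the
# loop above performs: step one slot backwards per iteration and stop once
# EMPTY_SLOTS_NUMBER empty slots have been seen in total. Note the counter is
# never reset, so the empty slots do not have to be consecutive.
from dateutil.relativedelta import relativedelta

def walk_slots_backwards(start_dt, level='hour', empty_slots_limit=10,
                         is_empty=lambda dt: True):
    step = relativedelta(**{{'hour': 'hours', 'day': 'days'}[level]: 1})
    current_dt, empty_seen = start_dt, 0
    while empty_seen <= empty_slots_limit:
        if is_empty(current_dt):  # stand-in for "remove() deleted 0 records"
            empty_seen += 1
        current_dt -= step
    return current_dt  # first datetime past the purge horizon

# e.g. walk_slots_backwards(datetime(2016, 1, 1), level='hour')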
def run(self):
    inp_queue = self.inp_queue
    start_time = time.time()

    while not self.stopped():
        # make sure we intercept all errors
        try:
            task = inp_queue.get()
            if task is self.QUIT or task == 'QUIT':
                LOGGER.debug('received QUIT signal %s' % self)
                break
            start_time = time.time()
            self._busy = True  # Just started doing our post processing
            post_fields = self.preprocess_post(task)
            if not post_fields:
                LOGGER.warning('no post_fields in: %s', task)
                continue
            # LOGGER.debug('creating post %r %s', post_fields.get('content'), inp_queue.qsize())
            if self.assign_channels(post_fields):
                self.create_post(**post_fields)
            else:
                LOGGER.info('skipping post %r' % post_fields.get('content'))
                self.inc_skipped()
            self._busy = False  # Just finished doing our post processing
        except Exception as err:
            LOGGER.error(err, exc_info=True)
        finally:
            # The original finally body was truncated; clearing the busy flag
            # here is an assumption consistent with the flag handling above.
            self._busy = False
def run(self):
    self.ds_client = None
    while not self.stopped():
        try:
            del self.ds_client  # to garbage-collect the old client ASAP
            self._running = False
            if not get_var('ON_TEST'):
                self.ds_client = DatasiftClient(ds_login=self.ds_login,
                                                ds_api_key=self.ds_api_key,
                                                bot_instance=self,
                                                sanity_checker=self.checker)
            else:
                self.ds_client = TestDatasiftClient(bot_instance=self)
            self.ds_client.connect()
            self._running = True
            LOGGER.info('connected to %s', self.ds_client.WEBSOCKET_BASE_URL)
            self.checker.set_client(self.ds_client)
            self.ds_subscriber.set_client(self.ds_client)
            self.ds_client.run()  # receives posts from Datasift
        except Exception as e:
            LOGGER.error(e, exc_info=True)
            sleep(5)  # wait a bit on any unexpected error
def build_regression_pipe(self, pipe):
    F = self.train_class.F
    reward = '$' + F.reward
    context = '$' + F.context
    # context_vector = '$' + F.context_vector
    # action_vector = '$' + F.action_vector
    action = '$' + F.action

    context_f = []
    action_f = []
    for feature in self.FEATURES:
        if 'context' in feature:
            context_f.append(feature.replace('context:', ''))
        elif 'action' in feature:
            action_f.append(feature.replace('action:', ''))

    project_regr = {'$project': {'reward': '$reward', '_id': '$_id'}}
    for _feature in context_f:
        project_regr['$project'].update({'ctx_' + _feature: '$ctx.' + _feature})
    for _feature in action_f:
        project_regr['$project'].update({'act_' + _feature: '$act.' + _feature})
    pipe.append(project_regr)

    # No grouping -- only aggregating features data
    group_regr = {'$group': {'_id': {}, 'count': {'$sum': 1}}}
    for _feature in context_f:
        group_regr['$group'].update({
            'ctx_' + _feature: {
                '$push': {
                    # "key": action_vector + '.' + _feature,
                    "value": context + '_' + _feature,
                    "reward": reward,
                }
            }
        })
    for _feature in action_f:
        group_regr['$group'].update({
            'act_' + _feature: {
                '$push': {
                    # "key": context_vector + '.' + _feature,
                    "value": action + '_' + _feature,
                    "reward": reward,
                }
            }
        })
    pipe.append(group_regr)
    LOGGER.info("Executing aggregation query: " + str(pipe))
    return pipe
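# For illustration only (assumed, not from the source): with
# FEATURES = ['context:age', 'action:offer'] and, hypothetically,
# F.reward == 'reward', F.context == 'ctx', F.action == 'act',
# the two stages appended to `pipe` would look roughly like:
#
#   {'$project': {'reward': '$reward', '_id': '$_id',
#                 'ctx_age': '$ctx.age', 'act_offer': '$act.offer'}}
#   {'$group': {'_id': {}, 'count': {'$sum': 1},
#               'ctx_age': {'$push': {'value': '$ctx_age', 'reward': '$reward'}},
#               'act_offer': {'$push': {'value': '$act_offer', 'reward': '$reward'}}}}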
def acquire_for_stream(self, ref):
    LOGGER.info(u"Acquiring auth for stream %s" % ref)
    with self._lock:
        auth = self._resource.get()
        self._in_use[ref.key] = auth
        LOGGER.debug(u"In use: {}".format(self._in_use))
        return auth
def postprocess_events(user):
    from solariat_bottle.db.user import set_user
    set_user(user)
    account = user.account
    start = time.time()
    try:
        _postprocess_events(account)
        # TODO: to remove
        # [11/11/16, 5:11:01 PM] Bogdan Neacsa: Hey Vlad, the way the architecture
        #   is going to work this is a scheduled task
        # [11/11/16, 5:11:10 PM] Bogdan Neacsa: So it will just restart
        #   automatically on next iteration
        # stop = False
        # while not stop:
        #     _postprocess_events(account)
        #     account.reload()
        #     if account.event_processing_needs_restart:
        #         account.update(event_processing_needs_restart=False)
        #         continue
        #     stop = True
    except:
        LOGGER.critical('[DynamicEvents Postprocessing] Cannot process events:',
                        exc_info=True)
    finally:
        account.update(event_processing_lock=False,
                       event_processing_needs_restart=False)
    LOGGER.info('[DynamicEvents Postprocessing] took: %s sec', time.time() - start)
def log_staff_request(user):
    if user and (user.is_superuser or user.is_staff) \
            and request.path.startswith('/configure'):
        from solariat.utils.logger import format_request
        from solariat_bottle.settings import LOGGER
        LOGGER.info("\n" + format_request(request, user))
def post_authenticated(self, url, json=None, number_of_retries=None):
    assert self.options and self.options.username and self.options.password

    authtoken = None
    expired = None
    while True:
        if not authtoken:
            authtoken = self.get_authtoken(expired)
            expired = None
        auth_url = self.apply_token(url, json, authtoken)
        try:
            return self.post(auth_url, json=json)
        except ApplicationError as err:
            if str(err) == 'Auth token %s is expired' % authtoken:
                LOGGER.info(err)
                expired = authtoken
                authtoken = None
            else:
                LOGGER.exception(err)
                break
        except UnauthorizedRequestError as err:
            LOGGER.warning(err, exc_info=True)
            expired = authtoken
            authtoken = None
        except InfrastructureError as err:
            LOGGER.exception(err)
            if number_of_retries is None:
                time.sleep(self.sleep_timeout)
            elif isinstance(number_of_retries, int) and number_of_retries > 0:
                number_of_retries -= 1
            else:
                break
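# Minimal usage sketch (assumed API, not from the source): POST with a bounded
# number of retries on infrastructure errors; the auth token is refreshed
# automatically when the server reports it as expired. `ApiClient` and the
# URL are hypothetical.
#
#   client = ApiClient(options)
#   result = client.post_authenticated(
#       'https://example.com/api/v1.2/commands',
#       json={'action': 'ping'},
#       number_of_retries=3)       # None would retry indefinitely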
def purge_channel_stats(channel):
    days = get_var('CHANNEL_STATS_KEEP_DAYS')
    start_date = datetime(year=2012, month=1, day=1)
    end_date = now() - timedelta(days=days)
    # end_date = datetime(year=end_date.year, month=end_date.month, day=1)
    timeslots = (
        (datetime_to_timeslot(start_date, level),
         datetime_to_timeslot(end_date, level))
        for level in TIMESLOT_LEVEL_NAMES
    )
    F = ChannelStats.F
    removed_count = 0
    for start_ts, end_ts in timeslots:
        t0 = datetime.now()
        res = ChannelStats.objects.coll.remove({
            F('time_slot'): {'$lte': end_ts, '$gt': start_ts},
            F('channel'): channel.id,
        })
        LOGGER.info(
            "purging Q:: channel: %s; collection: ChannelStats; func: %s; timedelta: %s"
            % (channel.title, inspect.stack()[0][3], datetime.now() - t0))
        removed_count += res['n']
    return removed_count
def run(self):
    post_data = None
    authtoken = None
    expired_authtoken = None

    while True:
        if not authtoken:
            authtoken = self.get_authtoken(expired_authtoken)
        if not post_data:
            post_data = self.task_queue.get()

        url = '%s/api/v1.2/posts?token=%s' % (self.options.url, authtoken)
        headers = {'Content-Type': 'application/json'}
        try:
            self.handle_connection(url, 'POST', post_data, headers)
        except ApplicationError as err:
            if str(err) == 'Auth token %s is expired' % authtoken:
                LOGGER.info(err)
                expired_authtoken = authtoken
                authtoken = None
            else:
                LOGGER.error(err)
                post_data = None
                self.task_queue.task_done()
        except InfrastructureError as err:
            LOGGER.error(err, exc_info=True)
            gevent.sleep(self.sleep_timeout)
        else:
            post_data = None
            self.task_queue.task_done()
def trends_remove(counter):
    t0 = datetime.now()
    res = ChannelTopicTrends.objects.coll.remove({FT('gc_counter'): counter})
    LOGGER.info(
        "purging Q:: collection: ChannelTopicTrends; func: %s; timedelta: %s"
        % (inspect.stack()[0][3], datetime.now() - t0))
    return res
def preprocess_post(self, event_json):
    if isinstance(event_json, (tuple, list)):
        message_type, data = event_json
        post_data = None
        preprocess = self.preprocessors.get(message_type)
        if preprocess is None:
            LOGGER.warn(u"Unknown message type: %s\nEvent is: %s"
                        % (message_type, event_json))
            return None
        try:
            post_data = preprocess(data)
        except:
            import traceback
            traceback.print_exc()
            LOGGER.warn(u"Error parsing tweet: %s" % unicode(event_json))
        if post_data:
            return post_data
        else:
            LOGGER.info(u"Twitter event: %s" % unicode(event_json))
    elif isinstance(event_json, dict):
        # already processed
        return event_json
    return None
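# Dispatch sketch (assumed, not from the source): preprocess_post expects
# self.preprocessors to map a stream message type to a callable that turns
# the raw payload into post fields. The mapping below is hypothetical.
#
#   self.preprocessors = {
#       'tweet': parse_tweet,            # returns a post_fields dict
#       'direct_message': parse_dm,
#       'delete': lambda data: None,     # events we ignore yield no post
#   }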
def _postprocess_events(account):
    assert account.event_processing_lock
    from solariat_bottle.db.events.event import Event
    from solariat_bottle.db.journeys.customer_journey import CustomerJourney

    LOGGER.info('[DynamicEvents Postprocessing] Start re-processing '
                'ALL dynamic events because some of schemas were changed.')
    # Reset customer journey data
    CustomerJourney.objects.remove(account_id=account.id)

    # TODO: After account specific collection is done this should work just fine / uncomment
    # Event.objects.coll.update({'_id': {'$ne': 1}}, {'$set': {'_wp': False}}, multi=True)
    channels = account.get_current_channels()
    Event.objects.coll.update(
        {'_id': {'$ne': 1}, 'cs': {'$in': [c.id for c in channels]}},
        {'$set': {'_wp': False}},
        multi=True)

    from solariat_bottle.tasks.journeys import process_event_batch
    batch_size = 2000
    total_count = Event.objects.count()
    n_batches = total_count / batch_size + 1  # Python 2 floor division
    progress = 0
    for batch_nr in xrange(n_batches):
        process_event_batch(account.id, batch_size)
        progress += 100.0 / n_batches
        account.update(resync_progress=progress)
def get_status(self):
    """:return: The status of the current runner, based on the equivalent
    datasift status."""
    self.subscription.reload()
    if self.subscription.status in {SUBSCRIPTION_STOPPED, SUBSCRIPTION_ERROR}:
        return self.subscription.status

    status_data = self.get_subscription_status()
    LOGGER.info(status_data)
    if status_data is None or status_data is False:
        return SUBSCRIPTION_PENDING

    if status_data['status'] in ('init', 'queued', 'running'):
        chunks = ['status:' + str(dt['status']) + ', progress:' + str(dt['progress'])
                  for dt in status_data.get('chunks', [])]
        LOGGER.info("Query status is %s. Data chunks status are (%s)."
                    % (status_data['status'], chunks))
        self.subscription.update(status_data_historics=status_data,
                                 status=SUBSCRIPTION_RUNNING)
        return SUBSCRIPTION_RUNNING

    self.subscription.update(status_data_historics=status_data,
                             status=SUBSCRIPTION_FINISHED)
    return SUBSCRIPTION_FINISHED
def create_new_user(cls):
    data = dict(permissions=['publish_actions', 'read_stream'],
                installed=True,
                access_token="%s|%s" % (cls.app_id, cls.app_secret))
    user = requests.post(
        "https://graph.facebook.com/v2.2/%s/accounts/test-users" % cls.app_id,
        data=data).json()
    LOGGER.info("Created user " + str(user))
    return user
def get_stage_path_class(self, journey):
    import json
    from solariat_bottle.db.journeys.journey_type import JourneyStageType

    metric_values = [json.loads(metric_value)
                     for metric_value in self.analysis.metric_values]
    # Load up actual stages
    journey_sequences = []
    for entry in metric_values:
        try:
            entry['stage'] = JourneyStageType.objects.get(
                display_name=entry['stage']).display_name
            journey_sequence = journey.stage_sequence_names
            journey_sequences.append(journey_sequence)
        except JourneyStageType.DoesNotExist:
            # It's a strategy stage, need more specific aggregation for this.
            # The aggregation below was commented out in the original but is
            # required to define agg_results; StrategyLabelInformation is
            # assumed to live in the same module.
            from solariat_bottle.db.journeys.customer_journey import (
                EVENT_STRATEGY, PLATFORM_STRATEGY, StrategyLabelInformation)
            query = {CustomerJourney.F.id: journey.id}
            for strategy in {EVENT_STRATEGY, PLATFORM_STRATEGY}:
                query[StrategyLabelInformation.F.strategy] = strategy
                pipeline = [
                    {'$match': query},
                    {'$group': {
                        '_id': {"journey_id": '$' + StrategyLabelInformation.F.customer_journey_id},
                        'stage_sequence': {"$max": '$' + StrategyLabelInformation.F.stage_sequence_names},
                    }},
                ]
                agg_results = StrategyLabelInformation.objects.coll.aggregate(pipeline)['result'][0]
                journey_sequence = agg_results['stage_sequence']
                journey_sequences.append(journey_sequence)
                break

    for class_idx, metric_value in enumerate(metric_values):
        step = metric_value['step']
        stage = metric_value['stage']
        for journey_sequence in journey_sequences:
            if len(journey_sequence) <= step:
                LOGGER.info("Skipped sequence %s because shorter than step %s"
                            % (journey_sequence, step))
                continue
            stage_at_step = journey_sequence[step]
            if stage_at_step != stage:
                LOGGER.info(
                    "Skipped sequence %s because found stage %s at step %s instead of %s"
                    % (journey_sequence, stage_at_step, step, stage))
                continue
            return class_idx
    return self.analysis.IDX_SKIP  # Doesn't even matter
def release_for_stream(self, ref):
    LOGGER.info(u"Releasing auth for stream %s" % ref)
    with self._lock:
        if ref.key in self._in_use:
            auth = self._in_use.pop(ref.key)
            self.put(auth)
        else:
            auth = None
        LOGGER.debug(u"In use: {}".format(self._in_use))
        return auth
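# Convenience sketch (assumed, not from the source): acquire_for_stream and
# release_for_stream pair naturally into a context manager, so callers cannot
# forget to return an auth to the pool. `pool` stands for the object exposing
# the two methods above.
from contextlib import contextmanager

@contextmanager
def stream_auth(pool, ref):
    auth = pool.acquire_for_stream(ref)
    try:
        yield auth
    finally:
        pool.release_for_stream(ref)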
def run_or_restart_postprocessing(user, msg):
    account = user.account
    account.reload()
    if account.event_processing_lock:
        account.update(event_processing_needs_restart=True)
        LOGGER.info(msg)
    else:
        account.update(event_processing_lock=True)
        postprocess_events.async(user)
def add(self, job):
    with self.lock:
        if job.topic not in self.config.supported_topics:
            raise RegistryError('Job: %s topic "%s" is not supported'
                                % (job.name, job.topic))
        if job.name in self.registry:
            raise RegistryError('Job: %s already exists in registry' % job.name)
        self.registry[job.name] = job
        LOGGER.info('Job with name "%s" added', job.name)
def get(self, name):
    with self.lock:
        if name not in self.registry:
            LOGGER.info('No Job registered for: %s, trying to import.', name)
            try:
                self._import_module(name)
            except ImportError:
                LOGGER.error('Cannot import job module:', exc_info=True)
                raise RegistryError('No Job registered for: %s' % name)
        return self.registry[name]
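# Usage sketch (assumed, not from the source): jobs register themselves via
# add(), typically at import time, and get() lazily imports the job module on
# a cache miss. `registry` and the job name are hypothetical.
#
#   registry.add(daily_rollup_job)      # raises RegistryError on an
#                                       # unsupported topic or duplicate name
#   job = registry.get('daily_rollup')  # imports the module if not yet seen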
def mark_items_to_keep_query(doc_ids):
    t0 = datetime.now()
    update = ChannelHotTopics.objects.coll.update(
        {'_id': {'$in': doc_ids}},
        {'$set': {F('gc_counter'): MARKED_TO_KEEP}},
        multi=True)
    LOGGER.info(
        "purging Q:: collection: ChannelHotTopics; func: %s; timedelta: %s"
        % (inspect.stack()[0][3], datetime.now() - t0))
    return update
def _add_feed_thread(self):
    qsize = self.feed_queue.qsize()
    total_threads = len(self.feed_api_threads)
    if (total_threads == 0 or qsize > 1) and total_threads < self.max_workers:
        thread = FeedApiThread(
            args=(self.feed_queue, self.options),
            kwargs={'User-Agent': '%s-%s' % (self.user_agent, self._num)})
        thread.daemon = True
        thread.start()
        self.feed_api_threads.append(thread)
        LOGGER.info("Added FeedApiThread")
def run(self):
    post_data = None
    authtoken = None
    expired_authtoken = None

    while True:
        if not authtoken:
            authtoken = self.get_authtoken(expired_authtoken)
        if not post_data:
            post_data = self.task_queue.get()
            # This is used both by datasift and by twitter_bot_dm.
            # Just to be safe, in case we receive a dict with no 'channels' key,
            # do the processing here (as is the case with twitter_bot);
            # otherwise assume it was done before (as is the case with datasift_bot).
            if isinstance(post_data, dict) and 'channels' not in post_data:
                channels = handle_post('Twitter', post_data['user_profile'], post_data)
                if channels:
                    channels = [str(c.id) for c in channels]
                    post_data['channels'] = channels
                # we need this only for getting channels
                if 'direct_message' in post_data:
                    del post_data['direct_message']
                post_data = json.dumps(post_data)

        self.reset_buff()
        self.conn.setopt(pycurl.POSTFIELDS, post_data)
        self.conn.setopt(
            pycurl.URL,
            '%s/api/v1.2/posts?token=%s' % (self.options.url, authtoken))
        self.conn.setopt(pycurl.HTTPHEADER, ['Content-Type: application/json'])
        try:
            self.handle_connection()
        except ApplicationError as err:
            if str(err) == 'Auth token %s is expired' % authtoken:
                LOGGER.info(err)
                expired_authtoken = authtoken
                authtoken = None
            else:
                LOGGER.error(err)
                post_data = None
                self.task_queue.task_done()
        except InfrastructureError as err:
            LOGGER.error(err)
            time.sleep(self.sleep_timeout)
        else:
            post_data = None
            self.task_queue.task_done()
def _handle_tracking(self, action, pages=None, events=None):
    LOGGER.info(u"Invoked {}[{}]._handle_tracking action={} pages={} events={}".format(
        self.__class__.__name__, self.id, action, pages, events))
    if pages == 'all':
        pages = self.facebook_page_ids
    if events == 'all':
        events = self.tracked_fb_event_ids

    if pages:
        FacebookTracking.objects.handle_channel_event(action, self, pages, PAGE)
    if events:
        FacebookTracking.objects.handle_channel_event(action, self, events, EVENT)
def reset_db():
    # Run this once so that collections are totally reset and indexes applied
    setup_db_connection({"DB_NAME": TEST_DB, "TEST_DB_NAME": TEST_DB})
    db = get_connection()
    for coll_name in db.collection_names():
        if coll_name != RPC_COLLECTION and not coll_name.startswith('system.'):
            coll = db[coll_name]
            coll.drop()
    LOGGER.info("Creating indexes...")
    indexctl.put_indexes([], True)
def get_insight(user, analyzer_id):
    if not analyzer_id:
        return jsonify(ok=False, error="missing parameter analyzer_id")

    if request.method == 'DELETE':
        removed = InsightsAnalysis.objects.remove(id=analyzer_id)
        LOGGER.info("Removing analysis finished successfully: " + str(removed))
        return jsonify(ok=bool(removed['ok']),
                       message="Successfully removed %s analysis." % removed['n'])
    else:
        try:
            return jsonify(ok=True,
                           item=InsightsAnalysis.objects.get(analyzer_id).to_dict())
        except InsightsAnalysis.DoesNotExist:
            return jsonify(ok=False, error="No Analysis found with id = %s" % analyzer_id)
def create(self, user, data_loader):
    assert isinstance(data_loader, SchemaBasedDataLoader)
    discovered_schema = data_loader.read_schema()
    schema_entity = self.profile_cls.create(self.parent.id)
    schema_entity.add_perm(user)

    start = time.time()
    schema_entity.update(discovered_schema=discovered_schema)
    LOGGER.info('Analyzing of input data took: %s', time.time() - start)

    finish_data_load.async(user, schema_entity, data_loader)
    # finish_data_load(user, schema_entity, data_loader)
    return schema_entity
def tw_process_historic_subscription(subscription):
    from solariat_bottle.daemons.twitter.historics.subscriber import TwitterHistoricsSubscriber
    from solariat_bottle.settings import LOGGER
    from datetime import datetime

    start_time = datetime.now()
    subscriber = TwitterHistoricsSubscriber(subscription)
    LOGGER.info("Subscription %s started at %s." % (subscription.id, start_time))

    subscriber.start_historic_load()
    LOGGER.info("Subscription %s finished. Elapsed time %s"
                % (subscription.id, datetime.now() - start_time))
def save_local_models(self):
    if hasattr(self, '_clf'):
        start_ts = dt.now()
        for key, local_model in self.clf._model_cache.items():
            local_model.save()
            # hack for float keys
            # because keys in model.clf_map should be strings
            # Alex Gogolev
            if isinstance(key, float):
                key = str(int(key))
            self.clf_map[str(key)] = local_model.id
        LOGGER.info("Saved %s LocalModel-s, timedelta: %s",
                    len(self.clf._model_cache), dt.now() - start_ts)
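# Illustration (assumed, not from the source): clf_map keys must be strings,
# presumably because the map is persisted, which is why float keys are
# collapsed to str(int(key)) above rather than str(key):
#
#   >>> key = 2.0
#   >>> str(key)        # would produce '2.0' -- an awkward map key
#   '2.0'
#   >>> str(int(key))   # what save_local_models stores instead
#   '2'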