def test_select_by_time_point_3(self):
    '''Test with different post creation dates'''
    DAY_20131212 = utc(datetime(day=12, month=12, year=2013))
    DAY_20131202 = utc(datetime(day=2, month=12, year=2013))
    DAY_20131002 = utc(datetime(day=2, month=10, year=2013))
    for d in [DAY_20131212, DAY_20131202, DAY_20131002]:
        self._create_db_post(_created=d, channel=self.channel,
                             content='i need some carrot')

    # Test for a single day
    results = ChannelHotTopics.objects.by_time_span(
        channel=self.channel,
        from_ts=datetime_to_timeslot(DAY_20131212, level='day'))
    self.assertEqual(results[0]['term_count'], 1)

    # For a month (both December posts fall into the same monthly slot)
    results = ChannelHotTopics.objects.by_time_span(
        channel=self.channel,
        from_ts=datetime_to_timeslot(DAY_20131212, level='month'))
    self.assertEqual(results[0]['term_count'], 2)

    # For a different month
    results = ChannelHotTopics.objects.by_time_span(
        channel=self.channel,
        from_ts=datetime_to_timeslot(DAY_20131002, level='month'))
    self.assertEqual(results[0]['term_count'], 1)
def main(manager=manager):
    """Supposed to be invoked by cron periodically."""
    # 1. check sleeping jobs
    _now = now()
    for job_status in JobStatus.objects.find(status=JobStatus.SLEEPING):
        if utc(job_status.awake_at) > _now:
            continue
        job = manager.registry.get(job_status.name)
        manager.producer.send_message(job.topic, job_status)
        LOGGER.info('Job: %s awakened and sent to execution.', job_status.id)

    # 2. check timed out jobs
    for job_status in JobStatus.objects.find(status=JobStatus.RUNNING):
        job = manager.registry.get(job_status.name)
        last_activity = job_status.last_activity or job_status.started_date
        if _now - utc(last_activity) < timedelta(seconds=job.timeout):
            continue
        job_status.update(completion_date=now(), status=JobStatus.TERMINATED)
        LOGGER.info('Job: %s terminated. No activity last %s seconds.',
                    job_status.id, job.timeout)
        if job.terminate_handler:
            try:
                job.terminate_handler(*job_status.args, **job_status.kwargs)
                LOGGER.info('terminate_handler complete for Job: %s.', job_status.id)
            except Exception as ex:
                LOGGER.error('Error executing terminate_handler: %s', ex, exc_info=True)
def lookup_history(self, event, lookback_window):
    actor_num, _ = unpack_event_id(event.id)
    id_lower_bound = pack_event_id(
        actor_num, utc(event._created - timedelta(seconds=lookback_window)))
    id_upper_bound = pack_event_id(actor_num, utc(event._created))
    event_sequence = self.find(id__lte=id_upper_bound, id__gte=id_lower_bound)[:]
    return event_sequence
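# Note (not from the source): lookup_history and the sibling range queries
# below rely on event ids packing (actor_num, timestamp) so that ids sort
# first by actor and then by time. Under that assumption a plain id range
# scan returns one actor's events inside the lookback window; the values
# here are hypothetical:
#
#   lower = pack_event_id(42, utc(created - timedelta(seconds=3600)))
#   upper = pack_event_id(42, utc(created))
#   hour_of_events = self.find(id__gte=lower, id__lte=upper)
#
# Events of other actors fall outside [lower, upper] because actor_num
# occupies the most significant part of the packed id.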
def test_posts_filtered_by_date(self):
    content = "I need a laptop @screen_name test"
    _from = utc(datetime(2016, 3, 14, 19, 00, 00))
    _to = utc(datetime(2016, 3, 14, 20, 00, 00))
    posts_at = [
        _from - timedelta(seconds=1),  # doesn't lie within 'from/to' range
        _from,                         # 1
        _from + timedelta(seconds=1),  # 2
        _to - timedelta(seconds=1),    # 3
        _to,                           # 'to' in query is an open interval bound
        _to + timedelta(seconds=1),
    ]
    for _created in posts_at:
        post = self._create_db_post(content=content, _created=_created)

    posts = self.get_post_details({
        "channel_id": str(self.channel.id),
        "from": _from.strftime("%Y-%m-%d %H:%M:%S"),
        "to": _to.strftime("%Y-%m-%d %H:%M:%S"),
    })
    eq_(len(posts), 3)
def events_for_actor(self, start, end, actor_num):
    id_lower_bound = pack_event_id(actor_num, utc(start))
    id_upper_bound = pack_event_id(actor_num, utc(end))
    if id_lower_bound == id_upper_bound:
        res = self.find(id=id_upper_bound)
    else:
        res = self.find(id__lte=id_upper_bound, id__gte=id_lower_bound)
    return res
def test_to_mongo(self):
    facet = facets.DateTimeFacet(FooBar.fields['created_at'])
    q = facet.to_mongo(Query['created_at'])
    expected = {'crtd': {
        '$gte': utc(datetime(2016, 8, 10)),
        '$lt': utc(datetime(2016, 8, 11)),
    }}
    eq_(q, expected)
def test_get_query(self):
    mongo_query = facets.FacetUI.get_query(FooBar.fields, Query)
    expected = {
        'nm': {'$in': ['Purchasing', 'Booking']},
        'stts': {'$in': ['deactivated', 'suspended']},
        'cntr': {'$gte': 10, '$lt': 20},
        'crtd': {'$gte': utc(datetime(2016, 8, 10)), '$lt': utc(datetime(2016, 8, 11))},
        'updtd': {'$gte': utc(datetime(2016, 8, 11)), '$lt': utc(datetime(2016, 8, 12))},
        'actv': {'$in': [True]},
        'stgs': {'$all': ['stage1', 'stage2']},
    }
    eq_(mongo_query, expected)
def filters_fulfilled(self):
    start_date = self.filters.get('start_date')
    end_date = self.filters.get('end_date')

    predicates = []
    p = predicates.append
    if start_date:
        p(utc(self.timeline_min_date) <= utc(start_date))
    if end_date:
        p(utc(self.timeline_max_date) >= utc(end_date))

    res = all(predicates)
    LOGGER.debug('[:::: filters_fulfilled ::::] %s', res)
    return res
def _filter_tweet(self, tweet):
    predicates = []
    p = predicates.append
    tweet_date = parse_datetime(tweet['created_at'])
    start_date = self.filters.get('start_date')
    end_date = self.filters.get('end_date')
    if start_date:
        p(tweet_date >= utc(start_date))
    if end_date:
        p(tweet_date <= utc(end_date))
    return all(predicates)
def construct_feature_space(self, event, features_metadata=None):
    actor_num, _ = unpack_event_id(event.id)
    id_lower_bound = pack_event_id(
        actor_num, utc(event._created - timedelta(seconds=self.lookback_window)))
    id_upper_bound = pack_event_id(actor_num, utc(event._created))
    event_sequence = Event.objects(id__lte=id_upper_bound, id__gte=id_lower_bound)[:]

    vector_space = []
    # Use a distinct loop variable; the original shadowed the `event` argument.
    for seq_event in event_sequence:
        event_vector = self.vectorize(seq_event, features_metadata)
        if event_vector is not None:
            vector_space.append(event_vector)
    return vector_space
def test_to_mongo(self):
    query = {
        'from': '2016-08-15',
        'to': '2016-08-16',
    }
    facet = facets.DateRangeFacet(FooBar.fields['created_at'])
    eq_(facet.to_mongo(query),
        {'crtd': {'$gte': utc(datetime(2016, 8, 15)),
                  '$lt': utc(datetime(2016, 8, 16))}})
def purge_days(channel):
    '''Purge the days we keep in history that have not been purged yet,
    working backwards from now.
    '''
    # All days in the intersection of [last_purged, today] and [14 days ago, today].
    if channel.last_purged:
        range_start = utc(channel.last_purged)
    else:
        range_start = now() - relativedelta(days=14)
    days_to_purge = list(gen_timeslots(range_start, now(), level='day'))

    trend_stats = [0, 0, 0]
    topic_stats = [0, 0, 0]
    for day in days_to_purge:
        topic_res = mark_and_sweep_topics(channel, day)
        topic_stats = [x + y for x, y in zip(topic_stats, topic_res)]
        #LOGGER.debug("TOPIC STATS: %s", topic_res)
        trend_res = purge_corresponding_trends(channel=channel, timeslot=day)
        trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]
    return days_to_purge, topic_stats, trend_stats
def purge_months(channel):
    '''Purge the months we keep in history that have not been purged yet,
    working backwards from now.
    '''
    if channel.last_purged:
        range_start = utc(channel.last_purged)
    else:
        range_start = now() - relativedelta(months=2)

    # Only include the current month once we are more than 7 days into it;
    # otherwise stop at the previous month.
    mday = localtime().tm_mday
    if mday > 7:
        range_end = now()
    else:
        range_end = now() - relativedelta(months=1)

    months_to_purge = []
    trend_stats = [0, 0, 0]
    topic_stats = [0, 0, 0]
    if range_start <= range_end:
        months_to_purge = list(
            gen_timeslots(range_start, range_end, level='month'))
    for month in months_to_purge:
        topic_res = mark_and_sweep_topics(channel, month)
        topic_stats = [x + y for x, y in zip(topic_stats, topic_res)]
        #LOGGER.debug("TOPIC STATS: %s", topic_res)
        trend_res = purge_corresponding_trends(channel=channel, timeslot=month)
        trend_stats = [x + y for x, y in zip(trend_stats, trend_res)]
    return months_to_purge, topic_stats, trend_stats
def progress(self):
    """Returns progress in percent."""
    if self._done is True:
        return 100
    # before first run of execute_request()
    if not self.result and not self._progress:
        return 0
    # when all tweets were fetched on a previous iteration, or state restored
    elif not self.result:
        return self._progress

    # compute progress by comparing the filters' date interval
    # with the current minimum fetched date
    start_date = self.filters.get('start_date')
    end_date = self.filters.get('end_date')
    if not end_date:
        end_date = now()
    full_interval = (end_date - start_date).total_seconds()
    ratio = (self.timeline_min_date - utc(start_date)).total_seconds() / full_interval
    if ratio < 0:
        # min_date < start_date, all tweets fetched
        self._progress = 100
    elif 0 <= ratio <= 1:
        # start_date < min_date < end_date, in progress
        self._progress = round((1.0 - ratio) * 100)
    else:
        # min_date > end_date, zero tweets fetched
        self._progress = 0
    return self._progress
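# A minimal worked example of the progress arithmetic above (standalone
# sketch, not from the source; dates are hypothetical): a 100-second filter
# window whose oldest fetched tweet sits 30 seconds after start_date gives
# ratio = 0.3, i.e. progress = 70.
from datetime import datetime, timedelta

start_date = datetime(2016, 1, 1, 0, 0, 0)
end_date = start_date + timedelta(seconds=100)
timeline_min_date = start_date + timedelta(seconds=30)  # oldest tweet fetched so far

full_interval = (end_date - start_date).total_seconds()                   # 100.0
ratio = (timeline_min_date - start_date).total_seconds() / full_interval  # 0.3
assert round((1.0 - ratio) * 100) == 70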
def get_timeslot_index(self, item):
    for idx in xrange(self.NUM_TIMERANGE_SLOTS):
        slot_boundary = self._cached_from_date + timedelta(hours=self.time_increment * idx)
        if not (hasattr(item, 'created_at') and utc(item.created_at) > slot_boundary):
            break
    return datetime_to_timestamp_ms(
        self._cached_from_date + timedelta(hours=self.time_increment * idx))
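# Worked example of the slot search above (standalone sketch with hypothetical
# numbers; NUM_TIMERANGE_SLOTS and time_increment are not taken from the
# source). An item created 13h after the cached start exceeds the boundaries
# at +0h, +6h and +12h, so the first boundary it does not exceed is idx=3 (+18h).
from datetime import datetime, timedelta

cached_from_date = datetime(2016, 1, 1)
time_increment = 6   # hours per slot (hypothetical)
num_slots = 8        # hypothetical NUM_TIMERANGE_SLOTS
created_at = cached_from_date + timedelta(hours=13)

for idx in range(num_slots):
    if created_at > cached_from_date + timedelta(hours=time_increment * idx):
        continue
    break
assert idx == 3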
def accept_sync(self):
    from solariat_bottle.db.user import get_user

    user = get_user()
    account = user.account
    if self.sync_status != self.SYNCED:
        raise ImproperStateError(self)

    coll = self.data_sync_coll
    count = coll.count()
    if not count:
        raise ImproperStateError('Cannot accept sync on 0 items.')

    account.reload()
    if account.event_processing_lock:
        raise ImproperStateError('Cannot accept sync now, global '
                                 'events re-processing is in progress.')

    account.update(event_processing_lock=True)
    bulk_insert = self.all_data_coll.initialize_unordered_bulk_op()
    for doc in self.data_sync_coll.find():
        bulk_insert.insert(doc)
    self.data_coll.remove()
    bulk_insert.execute()
    self.data_sync_coll.drop()
    postprocess_events.async(user)

    self.update(sync_status=self.IN_SYNC,
                updated_at=utc(now()),
                rows=count,
                sync_errors={})
def compute_customer_timeline(customer, from_dt, to_dt):
    def _get_platform(event):
        platform = event._t[0]
        if platform.endswith('Post') and platform != 'Post':
            platform = platform[:-len('Post')]
        return platform

    timeline_data = []
    for monthly_slot in reversed(
            list(timeslot.gen_timeslots(from_dt, to_dt, 'month'))):
        _month_start, _month_end = timeslot.Timeslot(monthly_slot).interval
        # count only this month's slice of [from_dt, to_dt]
        _month_events_count = Event.objects.range_query_count(
            max(utc(from_dt), _month_start), min(utc(to_dt), _month_end), customer)
        if not _month_events_count:
            continue

        if _month_start.month == to_dt.month:
            month_label = 'This Month'
        elif _month_start.month == to_dt.month - 1:
            month_label = 'Last Month'
        else:
            month_label = _month_start.strftime('%B')
        timeline_data.append([month_label, []])

        for daily_slot in reversed(
                list(timeslot.gen_timeslots(_month_start, _month_end, 'day'))):
            _day_start, _day_end = timeslot.Timeslot(daily_slot).interval
            _day_events = list(
                Event.objects.range_query(max(utc(from_dt), _day_start),
                                          min(utc(to_dt), _day_end),
                                          customer))
            if not _day_events:
                continue

            day_label = _day_start.strftime('%b %d')
            timeline_data[-1][-1].append([day_label, []])
            grouper = itertools.groupby(_day_events, _get_platform)
            for platform, platform_events in grouper:
                _events = list(platform_events)
                event_interval_ids = (str(_events[0].id), str(_events[-1].id))
                timeline_data[-1][-1][-1][-1].append(
                    (platform, len(_events), event_interval_ids))
    return customer, timeline_data
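# Shape of the structure built above (illustration only; labels, platform and
# ids are hypothetical):
#
#   timeline_data = [
#       ['This Month', [
#           ['Mar 14', [
#               ('Twitter', 3, ('<first_event_id>', '<last_event_id>')),
#           ]],
#       ]],
#       ['Last Month', [...]],
#   ]
#
# i.e. months -> days -> (platform, event count, first/last event id) tuples,
# newest first at every level.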
def get_csv_input_file(self, size=10):
    now_ts = datetime_to_timestamp(utc(now()))
    csv_file = tempfile.TemporaryFile('w+')
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(self.TEST_ITEM_COLUMNS)
    for _ in xrange(size):
        writer.writerow(self._gen_item_values(now_ts))
    csv_file.flush()
    csv_file.seek(0)
    return csv_file
def on_connect(self):
    """Emitted by Stream when connected successfully"""
    self.log_event('connect')
    self.last_keep_alive = now()
    was_offline = self.db.last_online(self.stream_id)
    since = None
    if was_offline:
        _, _, since, self.last_status_id = was_offline
    self.set_event(Events.EVENT_ONLINE)

    MAX_OFFLINE_TIME = 1.5 * 24 * 60 * 60  # 36 hours
    too_old = not since or (now() - utc(since)).total_seconds() > MAX_OFFLINE_TIME
    if not was_offline or too_old:
        return

    filters = {
        "start_date": since and utc(since),
        "end_date": self.last_keep_alive and utc(self.last_keep_alive),
        "since_id": self.last_status_id,
    }
    LOGGER.info(u'[%s] fetch offline direct messages with filters %s' % (self.stream_id, filters))

    from solariat_bottle.daemons.twitter.historics.timeline_request import DirectMessagesFetcher
    from solariat_bottle.utils.tweet import TwitterApiWrapper
    from solariat_bottle.daemons.twitter.parsers import DMTweetParser
    try:
        api = TwitterApiWrapper(auth_tuple=self.auth).api
        fetcher = DirectMessagesFetcher(api, **filters)
        parse = DMTweetParser()
        # The original used a `for ... else` here, which logged
        # "no offline direct messages" even when messages were fetched;
        # track emptiness explicitly instead.
        fetched_any = False
        for dm in fetcher.fetch():
            fetched_any = True
            self.on_direct_message(parse(dm))
        if not fetched_any:
            LOGGER.info(u'[%s] no offline direct messages' % (self.stream_id,))
    except:
        LOGGER.exception(u"[%s] fetch offline direct messages failed" % self.stream_id)
def _invalidate(timeout=default_timeout,
                value_attr='_cached_facebook_me',
                ts_attr='_cached_facebook_me_ts',
                value_getter=_graph_me):
    date_now = now()
    if not getattr(self, ts_attr) or \
            (date_now - utc(getattr(self, ts_attr))).total_seconds() > timeout:
        self.update(**{ts_attr: date_now,
                       value_attr: json.dumps(value_getter())})
    return json.loads(getattr(self, value_attr))
def update_schema(self, schema_json):
    if self.is_locked:
        raise ImproperStateError(self)

    self._clean_schema(schema_json)
    self._validate_schema(schema_json)
    if self.schema != schema_json:
        self.update(sync_status=self.OUT_OF_SYNC,
                    schema=schema_json,
                    updated_at=utc(now()))
    return True
def setUp(self):
    UICase.setUp(self)
    self.login()

    first_date = utc(datetime(2012, 1, 1))
    post1 = self._create_db_post(_created=first_date,
                                 content='i need some carrot')
    self.assertEqual(
        Post.objects(channels__in=[self.channel.id]).count(), 1)

    # 1 Jan + 10 minutes
    second_date = first_date + timedelta(minutes=10)
    post2 = self._create_db_post(_created=second_date,
                                 content='where i can buy a carrot?')
    self.assertEqual(
        Post.objects(channels__in=[self.channel.id]).count(), 2)

    # 1 Jan + 7 days
    third_date = first_date + timedelta(minutes=7 * 60 * 24)
    post3 = self._create_db_post(_created=third_date,
                                 content='i need some carrot')
    self.assertEqual(
        Post.objects(channels__in=[self.channel.id]).count(), 3)

    forth_date = third_date + timedelta(minutes=10)
    post4 = self._create_db_post(_created=forth_date,
                                 content='where i can buy a carrot?')
    self.assertEqual(
        Post.objects(channels__in=[self.channel.id]).count(), 4)

    # This one will not be created; it exists only for stats
    post5 = Post(channels=[self.channel.id],
                 content='LOL',
                 actor_id=post4.user_profile.id,
                 is_inbound=True,
                 _native_id='1',
                 _created=post4._created)
    self.assertEqual(
        Post.objects(channels__in=[self.channel.id]).count(), 4)
    no_post_created(post5, utc(forth_date + timedelta(minutes=10)))
    self.now = now()
def fb_get_private_messages(channel, page_id, user, since, until):
    from solariat_bottle.db.post.utils import factory_by_user
    from solariat_bottle.daemons.facebook.facebook_data_handlers import FBDataHandlerFactory
    from solariat_bottle.daemons.facebook.facebook_history_scrapper import FacebookHistoryScrapper

    puller = FacebookHistoryScrapper(channel, user=user)
    message_threads = puller.get_page_private_messages(page_id, since, until)['data']
    for thread in message_threads:
        if 'messages' in thread and thread['messages']['data']:
            # get the array of messages in reversed order
            data = thread['messages']['data'][::-1]
            root_message_id = data[0]['id']
            if thread['id'] in channel.tracked_fb_message_threads_ids:
                check = lambda msg, since: utc(
                    parser.parse(msg['created_time'])) > utc(since)
                messages_to_handle = [msg for msg in data if check(msg, since)]
            else:
                channel.tracked_fb_message_threads_ids.append(thread['id'])
                channel.save()
                messages_to_handle = data

            conversation_id = thread['id']
            for msg in messages_to_handle:
                if msg['id'] != root_message_id:
                    msg['root_post'] = root_message_id
                msg['conversation_id'] = conversation_id
                msg['page_id'] = page_id
                msg = puller.handle_data_item(
                    msg,
                    FBDataHandlerFactory.get_instance(FBDataHandlerFactory.PM),
                    thread['id'])
                factory_by_user(user, sync=True, **msg)
def get_json_input_file(self, size=10, event_types=None):
    json_file = tempfile.TemporaryFile('w+')
    now_ts = datetime_to_timestamp(utc(now()))
    res = []
    for _ in xrange(size):
        data_item = dict(zip(self.TEST_ITEM_COLUMNS,
                             self._gen_item_values(now_ts)))
        data_item.update({self.EVENT_TYPE_DATA_FIELD: random.choice(event_types)})
        res.append(data_item)
    json.dump(res, json_file)
    json_file.flush()
    json_file.seek(0)
    return json_file
def handle_rate_limit_error(self, error, path, failed_request_time, log_item):
    manager = FacebookRateLimitInfo.objects
    error_code = self._parse_fb_error_code(error)
    if error_code not in FB_RATE_LIMIT_ERRORS:
        return None

    last_rate_limit_info = manager.get_last_rate_limit_info(
        self.access_token, error_code, path)
    back_off_config = FacebookRateLimitInfo.LIMITS_CONFIG[error_code]
    if last_rate_limit_info:
        last_wait_time = last_rate_limit_info.wait_time
        wait_time = timedelta(seconds=min(
            back_off_config.end,
            last_wait_time * back_off_config.factor))
        if utc(last_rate_limit_info.failed_request_time + wait_time) > utc(failed_request_time):
            wait_until = last_rate_limit_info.failed_request_time + wait_time
        else:
            wait_until = timedelta(seconds=back_off_config.start) + failed_request_time
    else:
        wait_until = timedelta(seconds=back_off_config.start) + failed_request_time

    after = last_rate_limit_info and last_rate_limit_info.wait_until
    return manager.add_rate_limit_info(
        self.access_token, error_code, utc(failed_request_time), path,
        utc(wait_until), str(self._channel), after, log_item)
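# Sketch of the back-off progression implemented above (hypothetical config;
# start/factor/end are NOT the real values from FacebookRateLimitInfo's
# LIMITS_CONFIG). Each repeated rate-limit error multiplies the previous wait
# by `factor`, capped at `end` seconds.
start, factor, end = 60, 2, 600
waits, last_wait = [], None
for _ in range(5):
    last_wait = start if last_wait is None else min(end, last_wait * factor)
    waits.append(last_wait)
assert waits == [60, 120, 240, 480, 600]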
def test_topics_search_range(self):
    content = "I need a laptop @screen_name test"
    _created = utc(datetime.now())
    post = self._create_db_post(content=content, _created=_created)

    topics_yesterday = self.topics_search_range(_created - timedelta(days=1))
    eq_(topics_yesterday, [])

    topics_today = self.topics_search_range(_created)
    eq_(topics_today[0]['topic_count'], 1)

    topics_tomorrow = self.topics_search_range(_created + timedelta(days=1))
    eq_(topics_tomorrow, [])
def accept_sync(self):
    if self.sync_status != self.SYNCED:
        raise ImproperStateError(self)

    coll = self.data_sync_coll
    count = coll.count()
    # if not count:
    #     raise ImproperStateError('Cannot accept sync on 0 items.')
    if count:
        coll.rename(self.mongo_collection, dropTarget=True)

    self.update(sync_status=self.IN_SYNC,
                updated_at=utc(now()),
                rows=count,
                sync_errors={})
    if count > 10000:
        # For bigger collections, try to create indexes
        self.create_indexes()
def is_enabled(channel_id):
    from solariat_bottle.db.channel.base import Channel, ServiceChannel

    global CACHE_UPDATE, CHANNELS_ENABLED_CACHE
    if not CACHE_UPDATE or datetime.utcnow() - CACHE_UPDATE > CACHE_EXPIRE_IN:
        with cache_lock:
            LOGGER.debug('posts tracking: update cache')
            # re-check under the lock (double-checked locking)
            if not CACHE_UPDATE or datetime.utcnow() - CACHE_UPDATE > CACHE_EXPIRE_IN:
                channels = set()
                for ch in Channel.objects.find(status__in=['Active', 'Interim'],
                                               posts_tracking_enabled=True):
                    if ch.posts_tracking_disable_at and now() > utc(ch.posts_tracking_disable_at):
                        ch.update(posts_tracking_enabled=False)
                        LOGGER.debug('Disabling post tracking for channel: ' + str(ch.id))
                        continue
                    channels.add(ch.id)
                    if isinstance(ch, ServiceChannel):
                        channels.add(ch.inbound)
                        channels.add(ch.outbound)
                CHANNELS_ENABLED_CACHE = channels
                CACHE_UPDATE = datetime.utcnow()

    if channel_id is None:
        return True
    if not channel_id:
        LOGGER.debug("post tracking: skip log for channel_id: %s" % channel_id)
        return None
    if not isinstance(channel_id, (list, tuple, set)):
        channel_id = [channel_id]
    try:
        ch_ids = {ch.id if isinstance(ch, Channel) else ObjectId(ch)
                  for ch in channel_id}
    except TypeError:
        return set([])
    return {ch for ch in ch_ids if ch in CHANNELS_ENABLED_CACHE}
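# Usage sketch (illustration, not from the source): is_enabled answers three
# different questions depending on the argument it gets.
#
#   is_enabled(None)               # -> True: tracking is globally available
#   is_enabled('')                 # -> None: falsy id, nothing to check
#   is_enabled([ch1_id, ch2_id])   # -> subset of the ids that are enabled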
def test_native_id(self):
    acc = self.user.account
    schema = [
        {KEY_NAME: 'param', KEY_TYPE: TYPE_INTEGER},
    ]
    channel_type = ChannelType.objects.create_by_user(
        self.user,
        name='TestNativeIdChType',
        account=self.user.account,
        schema=schema)
    ChClass = channel_type.get_channel_class()
    channel = ChClass.objects.create_by_user(self.user,
                                             title='TestNativeIdChannel',
                                             channel_type_id=channel_type.id)
    event_type = acc.event_types.create(self.user, channel_type, 'TestNativeId')

    SCHEMA = [
        {KEY_NAME: 'name', KEY_TYPE: TYPE_STRING},
        {KEY_NAME: 'level', KEY_TYPE: TYPE_INTEGER},
        {KEY_NAME: 'origin_id', KEY_TYPE: TYPE_STRING, KEY_IS_NATIVE_ID: True},
    ]
    # TODO: actor_id is hardcoded
    DATA = [
        {'name': 'James Bond', 'level': 7, 'origin_id': '007', 'actor_id': 1},
        {'name': 'Archer', 'level': 8, 'origin_id': 'duchess', 'actor_id': 1},
        {'name': 'James Bond', 'level': 7, 'origin_id': '007', 'actor_id': 1},  # duplicate
    ]
    event_type.update_schema(SCHEMA)
    event_type.update(sync_status=EventType.IN_SYNC)

    start = utc(now())
    acc.event_types.import_data(self.user, channel, event_type, ListDataLoader(DATA))
    manager = event_type.get_data_class().objects
    self.assertEqual(manager.count(), 2)

    DATA_2 = [
        {'name': 'Archer', 'level': 8, 'origin_id': 'duchess', 'actor_id': 1},  # duplicate
        {'name': 'David Webb', 'level': 10, 'origin_id': 'jason_bourne', 'actor_id': 1},
    ]
    acc.event_types.import_data(self.user, channel, event_type, ListDataLoader(DATA_2))
    self.assertEqual(manager.count(), 3)
def post(self, user, **kwargs):
    profile_data = kwargs.pop('nps_profile')
    nps_profile = NPSProfile.objects.get_or_create(**profile_data)
    if 'actor_id' not in kwargs:
        # TODO: Why anonymous customer profile created here?
        CustomerProfile = user.account.get_customer_profile_class()
        customer_profile = CustomerProfile(account_id=user.account.id)
        customer_profile.add_profile(nps_profile)
        # customer_profile = nps_profile.customer_profile
        kwargs['actor_id'] = customer_profile.id

    kwargs['is_inbound'] = True
    kwargs['_created'] = utc(
        datetime.strptime(kwargs['_created'], DATE_FORMAT))
    if 'score' not in kwargs:
        kwargs.pop('response_type')
        data = NPSPost.objects.create_by_user(user, **kwargs)
    else:
        data = NPSOutcome.objects.create_by_user(user,
                                                 user_profile=nps_profile,
                                                 **kwargs)
    return data.to_dict()