def test_fetchable(self):
    fctrl = FeedController()
    total = fctrl.read().count()
    unix = datetime(1970, 1, 1).replace(tzinfo=timezone.utc)
    count = 0
    for fd in fctrl.list_late():
        count += 1
        self.assertEqual(unix, fd.last_retrieved)
        self.assertEqual(unix, fd.expires)
    self.assertEqual(total, count)
    fetchables = fctrl.list_fetchable()
    now = utc_now()
    for fd in fetchables:
        self.assert_in_range(now - timedelta(seconds=1),
                             fd.last_retrieved, now)
        self.assertEqual(unix, fd.expires)
    self.assert_late_count(
        0, "no late feed to report because all just fetched")
    fctrl.update({}, {'expires': unix})
    now = utc_now()
    for fd in fctrl.read():  # expires should be corrected
        self.assert_in_range(
            now + timedelta(seconds=conf.feed.min_expires - 1),
            fd.expires,
            now + timedelta(seconds=conf.feed.min_expires + 1))
    lr_not_matter = timedelta(seconds=conf.feed.min_expires + 10)
    self.update_all_no_ctrl(expires=utc_now() - timedelta(seconds=1),
                            last_retrieved=utc_now() - lr_not_matter)
    self.assert_late_count(total, "all feed just expired")
    self.update_all_no_ctrl(expires=utc_now() + timedelta(seconds=1))
    self.assert_late_count(
        0, "all feed will expire in a second, none are expired")

def metrics_users_long_term():
    logger.debug('Counting long term users')
    threshold_connection = utc_now() - timedelta(days=conf.feed.stop_fetch)
    threshold_created = utc_now() - timedelta(days=conf.feed.stop_fetch + 1)
    long_term = UserController().read(
        is_active=True,
        last_connection__ge=threshold_connection,
        date_created__lt=threshold_created)
    USER.labels(status='long_term').set(long_term.count())

def test_extract_max_age():
    max_age = conf.feed.max_expires / 2
    headers = {'cache-control': 'garbage max-age=%d garbage' % max_age}
    assert_in_range(
        extract_feed_info(headers)['expires'],
        utc_now() + timedelta(seconds=max_age))
    headers['expires'] = rfc_1123_utc(delta=timedelta(hours=12))
    assert_in_range(
        extract_feed_info(headers)['expires'],
        utc_now() + timedelta(seconds=max_age))

def update_slow_metrics():
    uctrl = UserController()
    USER.labels(status='any').set(uctrl.read().count())
    threshold_connection = utc_now() - timedelta(days=conf.feed.stop_fetch)
    threshold_created = utc_now() - timedelta(days=conf.feed.stop_fetch + 1)
    active = uctrl.read(is_active=True,
                        last_connection__ge=threshold_connection)
    USER.labels(status='active').set(active.count())
    long_term = uctrl.read(is_active=True,
                           last_connection__ge=threshold_connection,
                           date_created__lt=threshold_created)
    USER.labels(status='long_term').set(long_term.count())

def set_feed_error(self, error=None, parsed_feed=None):
    error_count = self.feed.error_count + 1
    if error:
        last_error = str(error)
    elif parsed_feed:
        last_error = str(parsed_feed.get('bozo_exception', ''))
    else:
        last_error = ''  # avoid an unbound name when called without details
    if self.feed.error_count > conf.feed.error_threshold:
        level = logging.WARNING
    else:
        level = logging.DEBUG
    logger.log(level, "%r: fetching feed error'd; error count -> %r",
               self.feed, error_count)
    logger.debug("%r: last error details %r", self.feed, last_error)
    now = utc_now()
    info = {'error_count': error_count,
            'last_error': last_error,
            'user_id': self.feed.user_id,
            'last_retrieved': now,
            'expires': None}  # forcing compute by controller
    FEED_FETCH.labels(feed_type=self.feed.feed_type.value,
                      result='error').inc()
    return FeedController().update({'id': self.feed.id}, info)

def test_ListFeedResource_get(self):
    resp = self.jarr_client('get', 'feeds')
    self.assertStatusCode(401, resp)
    feeds_u1 = self.jarr_client('get', 'feeds', user='******').json
    feeds_u2 = self.jarr_client('get', 'feeds', user='******').json
    feeds_u1 = [f['id'] for f in feeds_u1]
    feeds_u2 = [f['id'] for f in feeds_u2]
    self.assertFalse(set(feeds_u1).intersection(feeds_u2))
    # testing time formatting
    feed = self.jarr_client('get', 'feeds', user='******').json[0]
    now = utc_now()
    FeedController().update({'id': feed['id']}, {'last_retrieved': now})
    json = self._get(feed['id'], 'user1')
    self.assertEqual(json['last_retrieved'], now.isoformat())
    FeedController().update({'id': feed['id']},
                            {'last_retrieved': now.replace(tzinfo=None)})
    json = self._get(feed['id'], 'user1')
    self.assertEqual(json['last_retrieved'], now.isoformat())
    FeedController().update(
        {'id': feed['id']},
        {'last_retrieved': now.astimezone(timezone(timedelta(hours=12)))})
    json = self._get(feed['id'], 'user1')
    self.assertEqual(json['last_retrieved'], now.isoformat())

def list_late(self, limit=0):
    """Will list either late feeds or feeds with articles recently created.

    Late feeds are feeds which have been retrieved for the last time
    earlier than now minus the delta (defaults to 1h). The others are
    feeds with articles created later than now minus a quarter of the
    delta (hence 15 minutes by default).

    The idea is to keep very active feeds up to date and to avoid missing
    articles due to high activity (when, for example, the feed only
    displays its 30 last entries and produces more than one per minute).

    Feeds of inactive users (not connected for more than a month) or of
    manually deactivated users are ignored.
    """
    now = utc_now()
    min_expiring = now - timedelta(seconds=conf.feed.min_expires)
    max_expiring = now - timedelta(seconds=conf.feed.max_expires)
    filters = self._to_filters(
        last_retrieved__lt=min_expiring,
        __or__=[{'expires__lt': now, 'expires__ne': None},
                {'last_retrieved__lt': max_expiring,
                 'last_retrieved__ne': None}])
    query = self.get_active_feed().filter(*filters).order_by(Feed.expires)
    if limit:
        query = query.limit(limit)
    yield from query

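# The filter built in list_late() above combines two criteria; as a reading
# aid, here is a minimal plain-Python restatement of the same predicate
# (a sketch only: the real code runs this test in SQL and reads the
# thresholds from conf.feed.min_expires / conf.feed.max_expires, the values
# below are assumed for illustration).
from datetime import datetime, timedelta, timezone

MIN_EXPIRES_S = 30 * 60       # assumed example value, in seconds
MAX_EXPIRES_S = 6 * 60 * 60   # assumed example value, in seconds

def is_late_sketch(last_retrieved, expires, now=None):
    now = now or datetime.now(timezone.utc)
    min_expiring = now - timedelta(seconds=MIN_EXPIRES_S)
    max_expiring = now - timedelta(seconds=MAX_EXPIRES_S)
    if last_retrieved is None or last_retrieved >= min_expiring:
        return False  # fetched too recently to be considered late
    has_expired = expires is not None and expires < now
    long_untouched = last_retrieved < max_expiring
    return has_expired or long_untouched
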
def template_article(self):
    return {'feed_id': self.feed.id,
            'category_id': self.feed.category_id,
            'user_id': self.feed.user_id,
            'retrieved_date': utc_now()}

def _extract_max_age(headers, feed_info):
    if 'max-age' in headers.get('cache-control', ''):
        try:
            max_age = int(
                MAX_AGE_RE.search(headers['cache-control']).group(1))
            feed_info['expires'] = utc_now() + timedelta(seconds=max_age)
        except Exception:
            logger.exception("something went wrong while parsing max-age")

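# MAX_AGE_RE is defined elsewhere in the real module. Below is a minimal
# self-contained sketch of the same parsing, assuming a pattern along the
# lines of r'max-age=(\d+)'; the exact regex and the utc_now() helper are
# project specifics, so datetime.now(timezone.utc) stands in for utc_now().
import re
from datetime import datetime, timedelta, timezone

MAX_AGE_RE_SKETCH = re.compile(r'max-age\s*=\s*(\d+)')

def extract_max_age_sketch(headers, feed_info):
    cache_control = headers.get('cache-control', '')
    match = MAX_AGE_RE_SKETCH.search(cache_control)
    if match:
        feed_info['expires'] = (datetime.now(timezone.utc)
                                + timedelta(seconds=int(match.group(1))))
    return feed_info

# e.g. extract_max_age_sketch({'cache-control': 'public, max-age=3600'}, {})
# sets 'expires' roughly one hour from now.
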
def count_by_user_id(self, **filters):
    conn_max = utc_now() - timedelta(days=30)
    return dict(session.query(Article.user_id, func.count(Article.id))
                .filter(*self._to_filters(**filters))
                .join(User).filter(User.is_active.__eq__(True),
                                   User.last_connection >= conn_max)
                .group_by(Article.user_id).all())

def test_similarity_clustering(self):
    cluster_conf = {'tfidf_min_score': 0.6, 'tfidf_min_sample_size': 10}
    user = Mock(cluster_conf=cluster_conf)
    category = Mock(cluster_conf=cluster_conf)
    feed = Mock(cluster_conf=cluster_conf, user=user, category=category)
    cluster = Mock()

    def gen_articles(factor):
        return [Mock(simple_vector={'Sarkozy': 1, 'garb': 1, 'justice': 1},
                     feed=feed, cluster=cluster)] \
            + [Mock(feed=feed,
                    simple_vector={'Sark': 1, 'garbge': 1, 'vote': 1}),
               Mock(feed=feed,
                    simple_vector={'Sark': 1, 'garbae': 1, 'debat': 1}),
               Mock(feed=feed,
                    simple_vector={'Sark': 1, 'garbag': 1, 'blague': 1}),
               Mock(feed=feed,
                    simple_vector={'Sark': 1, 'garage': 1, 'chans': 1})] \
            * factor

    ccontr = ClusterController()
    ccontr._get_query_for_clustering = Mock(return_value=gen_articles(2))
    matching_article = Mock(
        simple_vector={'Morano': 1, 'garb': 1, 'justice': 1},
        date=utc_now(), lang='fr', feed=feed)
    self.assertIsNone(ccontr._get_cluster_by_similarity(matching_article))
    ccontr._get_query_for_clustering = Mock(return_value=gen_articles(100))
    self.assertEqual(ccontr._get_cluster_by_similarity(matching_article),
                     cluster)
    solo_article = Mock(simple_vector={'Sark': 1, 'fleur': 1},
                        date=utc_now(), lang='fr', feed=feed)
    self.assertNotEqual(cluster,
                        ccontr._get_cluster_by_similarity(solo_article))
    self.assertIsNone(ccontr._get_cluster_by_similarity(solo_article))

def get_active_feed(self, **filters):
    filters['error_count__lt'] = conf.feed.error_max
    query = self.read(status=FeedStatus.active, **filters)
    if conf.feed.stop_fetch:
        last_conn = utc_now() - timedelta(days=conf.feed.stop_fetch)
        return query.join(User).filter(User.is_active.__eq__(True),
                                       User.last_connection >= last_conn)
    return query

def populate_db():
    fcontr = FeedController()
    ccontr = CategoryController()
    UserController().create(**{'is_admin': True, 'is_api': True,
                               'cluster_enabled': False,
                               'login': '******', 'password': '******'})
    user1, user2 = [UserController().create(login=name,
                                            cluster_enabled=False,
                                            email="*****@*****.**" % name,
                                            password=name)
                    for name in ["user1", "user2"]]
    for iteration in range(2):
        article_total = 0
        for user in (user1, user2):
            for iter_cat in range(3):
                cat_id = None
                if iter_cat:
                    cat_id = ccontr.create(
                        user_id=user.id,
                        name=to_name(user, iteration, iter_cat)).id
                feed_id = fcontr.create(
                    link="feed%d%d" % (iteration, iter_cat),
                    user_id=user.id, category_id=cat_id,
                    title=to_name(user, iteration, iter_cat, iter_cat)).id
                for iter_art in range(3):
                    entry = to_name(user, iteration, iter_cat, iter_cat,
                                    iter_art)
                    tags = [to_name(user, iteration, iter_cat, iter_cat,
                                    iter_art, str(i)) for i in range(2)]
                    article_total += 1
                    ArticleController().create(
                        entry_id=entry,
                        link='http://test.te/%d' % article_total,
                        feed_id=feed_id, user_id=user.id, tags=tags,
                        category_id=cat_id, title=entry,
                        date=utc_now() + timedelta(seconds=iteration),
                        content="content %d" % article_total)
    session.commit()
    session.flush()
    ClusterController().clusterize_pending_articles()

def get():
    """Given a valid token, will provide a refreshed token to request the API."""
    jwt = current_app.extensions["jwt"]
    user = UserController(current_identity.id).get(id=current_identity.id)
    access_token = jwt.jwt_encode_callback(user).decode("utf8")
    UserController(user.id).update({"id": user.id},
                                   {"last_connection": utc_now(),
                                    "renew_password_token": ""})
    SERVER.labels(method="get", uri="/auth/refresh", result='2XX').inc()
    return {"access_token": "%s %s" % (conf.auth.jwt_header_prefix,
                                       access_token)}, 200

def list_fetchable(self, limit=0):
    now, feeds = utc_now(), list(self.list_late(limit))
    if feeds:
        for feed in feeds:
            if feed.last_retrieved == UNIX_START:
                continue
            FEED_LATENESS.labels(feed_type=feed.feed_type.value)\
                .observe((now - feed.last_retrieved).total_seconds())
        self.update({'id__in': [feed.id for feed in feeds]},
                    {'last_retrieved': now})
    return feeds

def __update_default_expires(self, feed, attrs):
    now = utc_now()
    min_delta = timedelta(seconds=conf.feed.min_expires)
    max_delta = timedelta(seconds=conf.feed.max_expires)
    min_expires = now + min_delta
    max_expires = now + max_delta
    method = 'from header'
    feed_type = getattr(feed.feed_type, 'value', '')
    if attrs['expires'] is None:
        attrs['expires'] = max_expires
        method = 'defaulted to max'
    try:
        # normalize the header-provided value and clamp it to [min, max]
        if not isinstance(attrs['expires'], datetime):
            attrs['expires'] = dateutil.parser.parse(attrs['expires'])
        if not attrs['expires'].tzinfo:
            method = 'from header added tzinfo'
            attrs['expires'] = attrs['expires'].replace(tzinfo=timezone.utc)
        elif max_expires < attrs['expires']:
            method = 'from header max limited'
            attrs['expires'] = max_expires
            logger.debug("%r expiring too late, forcing expire in %ds",
                         feed, conf.feed.max_expires)
        elif attrs['expires'] < min_expires:
            method = 'from header min limited'
            attrs['expires'] = min_expires
            logger.debug("%r expiring too early, forcing expire in %ds",
                         feed, conf.feed.min_expires)
    except Exception:
        attrs['expires'] = max_expires
        method = 'defaulted to max'
    # articles retrieved over the recent window modulate the expiry:
    # the more active the feed, the sooner it should expire again
    art_count = self.__actrl.read(
        feed_id=feed.id,
        retrieved_date__gt=now - max_delta * SPAN_FACTOR).count()
    if not art_count and method == 'from header min limited':
        attrs['expires'] = now + 2 * min_delta
        method = 'no article, twice min time'
    elif art_count:
        proposed_expires = now + max_delta / art_count / SPAN_FACTOR
        if min_expires < proposed_expires < attrs['expires']:
            attrs['expires'] = proposed_expires
            method = 'computed'
        if proposed_expires < min_expires:
            method = 'many articles, set to min expire'
            attrs['expires'] = min_expires
    exp_s = (attrs['expires'] - now).total_seconds()
    logger.info('%r : %d articles, expiring in %ds (%s)',
                feed, art_count, exp_s, method)
    FEED_EXPIRES.labels(method=method, feed_type=feed_type).observe(exp_s)

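# Worked example for the expiry computation above (all numbers are assumed,
# illustrative values; conf.feed.* and SPAN_FACTOR are project settings):
# with max_expires = 21600s (6h), min_expires = 1800s (30mn), SPAN_FACTOR = 2
# and art_count = 5 articles over the last max_delta * SPAN_FACTOR window,
# proposed_expires = now + 21600 / 5 / 2 s = now + 2160s; that is above
# min_expires, so the method becomes 'computed' whenever the header value
# was later than that. With art_count = 20 the proposal drops to now + 540s,
# falls under min_expires and is clamped to now + 1800s
# ('many articles, set to min expire').
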
def construct(self, entry):
    self.article = self.template_article()
    if not entry:
        return
    self.article['entry_id'] = self.extract_id(entry)
    try:
        self.article['date'] = self.extract_date(entry)
    except Exception:
        self.article['date'] = utc_now()
    self.article['title'] = self.extract_title(entry)
    self.article['tags'] = self.extract_tags(entry)
    self.article['link'] = self.extract_link(entry)
    self.article['content'] = self.extract_content(entry)
    self.article['lang'] = self.extract_lang(entry)
    self.article['comments'] = self.extract_comments(entry)

def post():
    """Given valid credentials, will provide a token to request the API."""
    attrs = login_parser.parse_args()
    jwt = current_app.extensions["jwt"]
    user = jwt.authentication_callback(attrs["login"], attrs["password"])
    if not user:
        SERVER.labels(method="post", uri="auth", result='4XX').inc()
        raise Forbidden()
    access_token = jwt.jwt_encode_callback(user).decode("utf8")
    UserController(user.id).update({"id": user.id},
                                   {"last_connection": utc_now(),
                                    "renew_password_token": ""})
    SERVER.labels(method="post", uri="/auth", result='2XX').inc()
    return {"access_token": "%s %s" % (conf.auth.jwt_header_prefix,
                                       access_token)}, 200

def get():
    user_id = current_identity.id
    user = UserController(user_id).get(id=user_id)
    categories = {cat.id: cat for cat in CategoryController(user_id).read()}
    response = make_response(
        render_template('opml.xml', user=user, categories=categories,
                        feeds=FeedController(user_id).read(), now=utc_now()))
    for key, value in OK_GET_HEADERS.items():
        response.headers[key] = value
    return response

def clean_feed(self, response, **info):
    """Reset the error counters on a feed that had known errors."""
    now = utc_now()
    info.update({'error_count': 0, 'last_error': None,
                 'last_retrieved': now, 'expires': None})
    info.update(extract_feed_info(response.headers, response.text))
    feed_permanently_redirected = response.history \
        and self.feed.link != response.url \
        and any(r.status_code in {301, 308} for r in response.history)
    if feed_permanently_redirected:
        logger.warning('%r: feed moved from %r to %r',
                       self.feed, self.feed.link, response.url)
        info['link'] = response.url
    if info:
        FeedController(self.feed.user_id).update({'id': self.feed.id}, info)

def reset_feeds():
    """Will reschedule all active feeds to be fetched in the next two hours"""
    fcontr = FeedController(ignore_context=True)
    now = utc_now()
    feeds = [feed[0] for feed in
             fcontr.get_active_feed().with_entities(fcontr._db_cls.id)]
    step = timedelta(seconds=conf.feed.max_expires / len(feeds))
    for i, feed_id in enumerate(feeds):
        fcontr.update({'id': feed_id},
                      {'etag': '', 'last_modified': '',
                       'last_retrieved': datetime(1970, 1, 1,
                                                  tzinfo=timezone.utc),
                       'expires': now + i * step})

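# Quick illustration of the rescheduling spread above (assumed numbers only):
# with conf.feed.max_expires = 7200s (the two hours the docstring mentions)
# and 100 active feeds, step = 7200 / 100 = 72s, so feed i gets
# expires = now + i * 72s; the last feed expires right at now + 2h while
# last_retrieved is reset to the epoch so every feed is eligible for refetch.
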
def construct(self, entry):
    self.article = self.template_article()
    if not entry:
        return
    self.article['entry_id'] = self.extract_id(entry)
    try:
        self.article['date'] = self.extract_date(entry)
    except Exception:
        self.article['date'] = utc_now()
    self.article['title'] = self.extract_title(entry)
    self.article['tags'] = self.extract_tags(entry)
    self.article['link'] = self.extract_link(entry)
    self.article['content'] = self.extract_content(entry)
    self.article['lang'] = clean_lang(self.extract_lang(entry))
    self.article['comments'] = self.extract_comments(entry)
    if self.article.get('link'):
        self.article['link_hash'] = self.to_hash(self.article['link'])
    if self.article.get('content'):
        self.article['content'] = clean_urls(self.article['content'],
                                             self.article['link'])

def test_scheduler(self):
    scheduler()
    UserController().update({}, {'last_connection': utc_now()})
    fctrl = FeedController()
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    self.assertEqual(fctrl.read().count(),
                     self.process_feed_patch.apply_async.call_count)
    self.assertEqual(0, self.clusteriser_patch.apply_async.call_count)
    self.assertEqual(0, self.feed_cleaner_patch.apply_async.call_count)
    feed1, feed2, feed3 = list(FeedController().read().limit(3))
    FeedController().update({'id__in': [feed1.id, feed3.id]},
                            {'status': 'to_delete'})
    FeedController().update({'id': feed2.id},
                            {'last_retrieved': epoch, 'expires': epoch})
    self.assertEqual(1, len(list(fctrl.list_fetchable())))
    scheduler()
    self.assertEqual(fctrl.read().count(),
                     self.process_feed_patch.apply_async.call_count)
    self.assertEqual(0, self.clusteriser_patch.apply_async.call_count)
    self.assertEqual(1, self.feed_cleaner_patch.apply_async.call_count)

def process_ids(cls, social_id, username, email):  # pragma: no cover
    labels = {"method": "get", "uri": "/oauth/callback/" + cls.provider}
    if social_id is None:
        SERVER.labels(result="4XX", **labels).inc()
        raise UnprocessableEntity('No social id, authentication failed')
    ucontr = UserController()
    try:
        user = ucontr.get(**{'%s_identity' % cls.provider: social_id})
    except NotFound:
        user = None
    if not user and not conf.oauth.allow_signup:
        SERVER.labels(result="4XX", **labels).inc()
        raise BadRequest('Account creation is not allowed through OAuth.')
    if not user:
        if username and not ucontr.read(login=username).count():
            login = username
        else:
            login = '******' % (cls.provider, username or social_id)
        user = ucontr.create(**{'%s_identity' % cls.provider: social_id,
                                'login': login, 'email': email})
    ucontr.update({"id": user.id},
                  {"last_connection": utc_now(),
                   "renew_password_token": ""})
    jwt_ext = current_app.extensions['jwt']
    access_token = jwt_ext.jwt_encode_callback(user).decode('utf8')
    SERVER.labels(result="2XX", **labels).inc()
    return {"access_token": "%s %s" % (conf.auth.jwt_header_prefix,
                                       access_token)}, 200

def test_fetching_anti_herding_mech_utcplustwelve(self):
    self._test_fetching_anti_herding_mech(
        utc_now().astimezone(timezone(timedelta(hours=12))))

def extract_date(entry):
    published = entry.get('date_published')
    if published:
        return dateutil.parser.parse(published).astimezone(timezone.utc)
    return utc_now()

def list_active(self):
    last_conn = utc_now() - timedelta(days=conf.feed.stop_fetch)
    return self.read(is_active=True, last_connection__ge=last_conn)

def _filter_unclustered(*fields):
    conn_max = utc_now() - timedelta(days=conf.feed.stop_fetch)
    return (session.query(*fields)
            .filter(Article.cluster_id.__eq__(None))
            .join(User)
            .filter(User.id == Article.user_id,
                    User.is_active.__eq__(True),
                    User.last_connection >= conn_max))

def test_fetching_anti_herding_mech_utctimezone(self):
    self._test_fetching_anti_herding_mech(utc_now())

def metrics_users_active():
    logger.debug('Counting active users')
    threshold_connection = utc_now() - timedelta(days=conf.feed.stop_fetch)
    active = UserController().read(is_active=True,
                                   last_connection__ge=threshold_connection)
    USER.labels(status='active').set(active.count())