Example #1
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched", feed)
        process_feed.apply_async(args=[feed.id])
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
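    # setnx + expire acts as a short-lived lock so only one worker schedules
    # feed deletions at a time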
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be delete", feed)
            feed_cleaner.apply_async(args=[feed.id])
            break  # only one at a time
    # applying clusterizer
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if not UserController().get(id=user_id).effectivly_active:
            continue
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            clusterizer.apply_async(args=[user_id])
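    # the scheduler re-enqueues itself after the configured idle delay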
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    WORKER.labels(method='scheduler').observe(
        (datetime.now() - start).total_seconds())
    update_slow_metrics.apply_async()
Example #2
    def test_articles_with_enclosure_and_fetched_content(self, truncated_cnt,
                                                         get_vector):
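        # the mocked fetched content and the audio enclosure yield two
        # articles; both must land in the same cluster, whose content only
        # exposes the fetched entry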
        self._clean_objs()
        get_vector.return_value = None
        truncated_cnt.return_value = {'type': 'fetched',
                                      'title': 'holy grail',
                                      'content': 'blue, no read, aaah',
                                      'link': 'https://monthy.python/brian'}
        feed = FeedController().read().first()
        FeedController().update({'id': feed.id},
                                {'truncated_content': True,
                                 'cluster_enabled': True})
        UserController().update({'id': feed.user_id},
                                {'cluster_enabled': True})

        builder = ClassicArticleBuilder(feed, self.entry_w_enclosure, {})
        self.assertIsNone(builder.article.get('article_type'))
        raw_articles = list(builder.enhance())
        self.assertEqual(2, len(raw_articles))
        self.assertEqual('audio', raw_articles[1]['article_type'].value)
        articles = []
        for raw_article in raw_articles:
            articles.append(
                ArticleController(feed.user_id).create(**raw_article))
        ClusterController(feed.user_id).clusterize_pending_articles()
        a1 = ArticleController().get(id=articles[0].id)
        a2 = ArticleController().get(id=articles[1].id)
        self.assertEqual(a1.cluster_id, a2.cluster_id)
        cluster = ClusterController().get(id=a1.cluster_id)
        self.assertEqual(2, cluster.content['v'])
        self.assertEqual(1, len(cluster.content['contents']))
        self.assertEqual('fetched', cluster.content['contents'][0]['type'])
Example #3
    def test_ListFeedResource_get(self):
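        # anonymous access is rejected and each user only sees its own feeds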
        resp = self.jarr_client('get', 'feeds')
        self.assertStatusCode(401, resp)
        feeds_u1 = self.jarr_client('get', 'feeds', user='******').json
        feeds_u2 = self.jarr_client('get', 'feeds', user='******').json
        feeds_u1 = [f['id'] for f in feeds_u1]
        feeds_u2 = [f['id'] for f in feeds_u2]

        self.assertFalse(set(feeds_u1).intersection(feeds_u2))

        # testing time formatting
        feed = self.jarr_client('get', 'feeds', user='******').json[0]
        now = utc_now()
        FeedController().update({'id': feed['id']}, {'last_retrieved': now})
        json = self._get(feed['id'], 'user1')
        self.assertEqual(json['last_retrieved'], now.isoformat())

        FeedController().update({'id': feed['id']},
                {'last_retrieved': now.replace(tzinfo=None)})
        json = self._get(feed['id'], 'user1')
        self.assertEqual(json['last_retrieved'], now.isoformat())

        FeedController().update({'id': feed['id']},
                {'last_retrieved':
                    now.astimezone(timezone(timedelta(hours=12)))})
        json = self._get(feed['id'], 'user1')
        self.assertEqual(json['last_retrieved'], now.isoformat())
Example #4
 def test_cluster_same_feed(self):
     article = ArticleController().read(category_id__ne=None).first()
     cluster = article.cluster
     # all is enabled, article in cluster
     update_on_all_objs(articles=cluster.articles,
                        cluster_enabled=True,
                        cluster_same_feed=True)
     article = self.create_article_from(cluster, cluster.main_article.feed)
     self.assertInCluster(article, cluster)
     # feed's disabled, won't cluster
     FeedController().update(
         {'id__in': [a.feed_id for a in cluster.articles]},
         {'cluster_same_feed': False})
     article = self.create_article_from(cluster, cluster.main_article.feed)
     self.assertNotInCluster(article, cluster)
     # category's disabled, won't cluster
     FeedController().update(
         {'id__in': [a.feed_id for a in cluster.articles]},
         {'cluster_same_feed': None})
     CategoryController().update({'id': cluster.main_article.category.id},
                                 {'cluster_same_feed': False})
     article = self.create_article_from(cluster, cluster.main_article.feed)
     self.assertNotInCluster(article, cluster)
     # user's disabled, won't cluster
     CategoryController().update({'id': cluster.main_article.category.id},
                                 {'cluster_same_feed': None})
     UserController().update({'id': cluster.user_id},
                             {'cluster_same_feed': False})
     article = self.create_article_from(cluster, cluster.main_article.feed)
     self.assertNotInCluster(article, cluster)
     # reenabling user, will cluster
     UserController().update({'id': cluster.user_id},
                             {'cluster_same_feed': True})
     article = self.create_article_from(cluster, cluster.main_article.feed)
     self.assertInCluster(article, cluster)
Example #5
    def test_adding_to_cluster_by_link(self):
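        # a cluster marked as read must wake up (become unread again) when
        # a new article with the same link is attached to it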
        ccontr = ClusterController()

        cluster = ccontr.read().first()
        ccontr.update({'id': cluster.id}, {
            'read': True,
            'read_reason': 'marked'
        })
        cluster = ccontr.get(id=cluster.id)
        self.assertTrue(cluster.read)
        article = cluster.articles[0]
        articles_count = len(cluster.articles)

        fcontr = FeedController(cluster.user_id)
        acontr = ArticleController(cluster.user_id)
        fcontr.update({'id': article.feed_id}, {'cluster_wake_up': True})
        feed = fcontr.read(id__ne=article.feed_id).first()
        update_on_all_objs(articles=[article],
                           feeds=[feed],
                           cluster_enabled=True)

        self._clone_article(acontr, article, feed)
        ccontr.clusterize_pending_articles()

        cluster = ccontr.get(id=cluster.id)
        self.assertEqual(articles_count + 1, len(cluster.articles))
        self.assertFalse(cluster.read)
Example #6
    def test_delete_main_cluster_handling(self):
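        # deleting the feed carrying the cluster's main article must promote
        # another article as the cluster's main one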
        suffix = 'suffix'
        clu = ClusterController().get(id=10)
        acontr = ArticleController(clu.user_id)
        fcontr = FeedController(clu.user_id)
        old_title = clu.main_title
        old_feed_title, old_art_id = clu.main_feed_title, clu.main_article_id
        for art_to_del in acontr.read(link=clu.main_article.link,
                                      id__ne=clu.main_article.id):
            acontr.delete(art_to_del.id)

        other_feed = fcontr.read(id__ne=clu.main_article.feed_id).first()
        update_on_all_objs(articles=[clu.main_article],
                           feeds=[other_feed],
                           cluster_enabled=True)
        acontr.create(
            feed_id=other_feed.id,
            entry_id=clu.main_article.entry_id + suffix,
            link=clu.main_article.link,
            title=clu.main_article.title + suffix,
            content=clu.main_article.content + suffix,
            date=clu.main_article.date + timedelta(1),
            retrieved_date=clu.main_article.retrieved_date + timedelta(1),
        )

        ClusterController(clu.user_id).clusterize_pending_articles()
        clu = ClusterController().get(id=10)
        self.assertEqual(2, len(clu.articles))
        fcontr.delete(clu.main_article.feed_id)
        new_cluster = ClusterController(clu.user_id).get(id=clu.id)
        self.assertEqual(1, len(new_cluster.articles))
        self.assertNotEqual(old_title, new_cluster.main_title)
        self.assertNotEqual(old_feed_title, new_cluster.main_feed_title)
        self.assertNotEqual(old_art_id, new_cluster.main_article_id)
Example #7
 def setUp(self):
     super().setUp()
     login = '******'
     self.user = UserController().get(login=login)
     self.user2 = UserController().get(login='******')
     self.fctrl = FeedController(self.user.id)
     self.cctrl = CategoryController(self.user.id)
     self.uctrl = UserController()
Example #8
 def put(feed_id):
     """Update an existing feed."""
     fctrl = FeedController(current_identity.id)
     attrs = parse_meaningful_params(parser_edit)
     changed = fctrl.update({'id': feed_id}, attrs)
     if not changed:
         fctrl.assert_right_ok(feed_id)
     return None, 204
Example #9
def populate_db():
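    # seed the database with an admin, two regular users, feeds spread over
    # categories (the first feed of each batch being category-less) and
    # three articles per feed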
    fcontr = FeedController()
    ccontr = CategoryController()
    UserController().create(
        **{
            'is_admin': True,
            'is_api': True,
            'cluster_enabled': False,
            'login': '******',
            'password': '******'
        })
    user1, user2 = [
        UserController().create(login=name,
                                cluster_enabled=False,
                                email="*****@*****.**" % name,
                                password=name) for name in ["user1", "user2"]
    ]

    for iteration in range(2):
        article_total = 0

        for user in (user1, user2):
            for iter_cat in range(3):
                cat_id = None
                if iter_cat:
                    cat_id = ccontr.create(user_id=user.id,
                                           name=to_name(
                                               user, iteration, iter_cat)).id
                feed_id = fcontr.create(
                    link="feed%d%d" % (iteration, iter_cat),
                    user_id=user.id,
                    category_id=cat_id,
                    title=to_name(user, iteration, iter_cat, iter_cat)).id
                for iter_art in range(3):
                    entry = to_name(user, iteration, iter_cat, iter_cat,
                                    iter_art)

                    tags = [
                        to_name(user, iteration, iter_cat, iter_cat, iter_art,
                                str(i)) for i in range(2)
                    ]
                    article_total += 1
                    ArticleController().create(
                        entry_id=entry,
                        link='http://test.te/%d' % article_total,
                        feed_id=feed_id,
                        user_id=user.id,
                        tags=tags,
                        category_id=cat_id,
                        title=entry,
                        date=utc_now() + timedelta(seconds=iteration),
                        content="content %d" % article_total)

    session.commit()
    session.flush()
    ClusterController().clusterize_pending_articles()
Example #10
    def test_admin_update_cluster_on_change_title(self):
        feed = ClusterController(2).read()[0].main_article.feed
        for cluster in feed.clusters:
            self.assertEqual(feed.title, cluster.main_feed_title)
        FeedController().update({'id': feed.id}, {'title': 'updated title'})

        feed = FeedController().get(id=feed.id)
        self.assertEqual('updated title', feed.title)
        for cluster in feed.clusters:
            self.assertEqual(feed.title, cluster.main_feed_title)
Example #11
    def test_cluster_enabled(self):
        ccontr = ClusterController()
        cluster = ccontr.read().first()
        feed = FeedController(cluster.user_id).read(
            category_id__ne=None,
            id__nin=[art.feed_id for art in cluster.articles]).first()
        category = feed.category

        # clustering works when all is true
        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed],
                           cluster_enabled=True)
        article = self.create_article_from(cluster, feed)
        self.assertInCluster(article, cluster)

        # disabling on the user deactivates all clustering by default
        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed],
                           cluster_enabled=None)
        UserController().update({'id': cluster.user_id},
                                {'cluster_enabled': False})
        article = self.create_article_from(cluster, feed)
        self.assertNotInCluster(article, cluster)

        # disabling on the article's feed prevents clustering
        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed],
                           cluster_enabled=True)
        FeedController().update({'id': feed.id}, {'cluster_enabled': False})
        article = self.create_article_from(cluster, feed)
        self.assertNotInCluster(article, cluster)

        # disabling a feed of the cluster's articles prevents clustering
        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed],
                           cluster_enabled=True)
        FeedController().update(
            {'id__in': [a.feed_id for a in cluster.articles]},
            {'cluster_enabled': False})
        article = self.create_article_from(cluster, feed)
        self.assertNotInCluster(article, cluster)

        # disabling on the article's category prevents clustering
        CategoryController(cluster.user_id).update({'id': category.id},
                                                   {'cluster_enabled': False})
        article = self.create_article_from(cluster, feed)
        self.assertNotInCluster(article, cluster)

        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed],
                           cluster_enabled=True)
        article = self.create_article_from(cluster, feed)
        self.assertInCluster(article, cluster)
Example #12
    def test_fetchable(self):
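        # never-retrieved feeds are reported as late; listing them as
        # fetchable stamps them as just retrieved, and a too-short expiration
        # set through the controller is corrected to roughly min_expires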
        fctrl = FeedController()
        total = fctrl.read().count()
        unix = datetime(1970, 1, 1).replace(tzinfo=timezone.utc)
        count = 0
        for fd in fctrl.list_late():
            count += 1
            self.assertEqual(unix, fd.last_retrieved)
            self.assertEqual(unix, fd.expires)
        self.assertEqual(total, count)

        fetchables = fctrl.list_fetchable()
        now = utc_now()
        for fd in fetchables:
            self.assert_in_range(now - timedelta(seconds=1), fd.last_retrieved,
                                 now)
            self.assertEqual(unix, fd.expires)
        self.assert_late_count(
            0, "no late feed to report because all just fetched")
        fctrl.update({}, {'expires': unix})
        now = utc_now()
        for fd in fctrl.read():  # expires should be corrected
            self.assert_in_range(
                now + timedelta(seconds=conf.feed.min_expires - 1), fd.expires,
                now + timedelta(seconds=conf.feed.min_expires + 1))

        lr_not_matter = timedelta(seconds=conf.feed.min_expires + 10)
        self.update_all_no_ctrl(expires=utc_now() - timedelta(seconds=1),
                                last_retrieved=utc_now() - lr_not_matter)
        self.assert_late_count(total, "all feed just expired")
        self.update_all_no_ctrl(expires=utc_now() + timedelta(seconds=1))
        self.assert_late_count(
            0, "all feed will expire in a second, none are expired")
Example #13
    def test_no_add_feed_skip(self):
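        # the feed answers 304 (not modified), so no article gets created,
        # whatever skip filters are configured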
        self.resp_status_code = 304
        self.assertEqual(BASE_COUNT, ArticleController().read().count())
        crawler()
        FeedController().update({}, {
            'filters': [{
                "type": "tag contains",
                "action on": "match",
                "pattern": "pattern5",
                "action": "skipped"
            }, {
                "type": "simple match",
                "action on": "match",
                "pattern": "pattern5",
                "action": "mark as read"
            }, {
                "type": "regex",
                "action on": "match",
                "pattern": "pattern5",
                "action": "skipped"
            }]
        })

        crawler()
        self.assertEqual(BASE_COUNT, ArticleController().read().count())
Example #14
    def set_feed_error(self, error=None, parsed_feed=None):
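        # register a failed fetch: bump the error counter, keep the error
        # message and reset 'expires' so the controller recomputes it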
        error_count = self.feed.error_count + 1
        if error:
            last_error = str(error)
        elif parsed_feed:
            last_error = str(parsed_feed.get('bozo_exception', ''))
        else:
            last_error = ''
        if self.feed.error_count > conf.feed.error_threshold:
            level = logging.WARNING
        else:
            level = logging.DEBUG
        logger.log(level, "%r: fetching feed error'd; error count -> %r",
                   self.feed, error_count)
        logger.debug("%r: last error details %r", self.feed, last_error)
        now = utc_now()
        info = {
            'error_count': error_count,
            'last_error': last_error,
            'user_id': self.feed.user_id,
            'last_retrieved': now,
            'expires': None
        }  # forcing compute by controller

        FEED_FETCH.labels(feed_type=self.feed.feed_type.value,
                          result='error').inc()
        return FeedController().update({'id': self.feed.id}, info)
Example #15
def crawler():
    user_ids = set()
    for feed in FeedController().list_fetchable(limit=1):
        process_feed.apply(args=[feed.id])
        user_ids.add(feed.user_id)
    for user_id in user_ids:
        clusterizer.apply(args=[user_id])
Example #16
 def test_feed_and_article_deletion(self):
     ccontr = CategoryController(2)
     cat = ccontr.read().first()
     ccontr.delete(cat.id)
     self.assertEqual(0,
                      ArticleController().read(category_id=cat.id).count())
     self.assertEqual(0, FeedController().read(category_id=cat.id).count())
Example #17
    def test_cluster_tfidf_control(self):
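        # tf-idf clustering is opt-in per feed: the article from the feed
        # with cluster_tfidf_enabled joins the cluster, the other does not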
        article = ArticleController().read(category_id__ne=None).first()
        cluster = article.cluster

        # leaving one cluster with one article
        clu_ids = [c.id for c in ClusterController().read(id__ne=cluster.id)]
        art_ids = [
            a.id
            for a in ArticleController().read(id__ne=cluster.main_article_id)
        ]
        ArticleController().update({'id__in': art_ids}, {'cluster_id': None})
        for clu_id in clu_ids:
            ClusterController().delete(clu_id)
        for art_id in art_ids:
            ArticleController().delete(art_id)
        self.assertEqual(1, ClusterController().read().count())
        self.assertEqual(1, ArticleController().read().count())

        feed1 = FeedController(cluster.user_id).create(
            title='new feed',
            cluster_conf={
                'tfidf_min_score': -1,
                'tfidf_min_sample_size': 1
            })
        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed1],
                           cluster_tfidf_enabled=True,
                           cluster_enabled=True)
        feed2 = FeedController(cluster.user_id).create(
            cluster_enabled=True,
            cluster_tfidf_enabled=False,
            title='new feed',
            cluster_conf={
                'tfidf_min_score': -1,
                'tfidf_min_sample_size': 1
            })

        article = self.create_article_from(cluster,
                                           feed1,
                                           link=cluster.main_article.link +
                                           'do not match link')
        self.assertInCluster(article, cluster, ClusterReason.tf_idf)
        article = self.create_article_from(cluster,
                                           feed2,
                                           link=cluster.main_article.link +
                                           'do not match link either')
        self.assertNotInCluster(article, cluster)
Example #18
    def _test_fetching_anti_herding_mech(self, now):
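        # being expired is not enough to be refetched right after a
        # retrieval, and a feed untouched for long enough is refetched even
        # before it expires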
        fctrl = FeedController()
        total = fctrl.read().count()

        half = timedelta(seconds=conf.feed.min_expires / 2)
        twice = timedelta(seconds=conf.feed.min_expires * 2)
        long_ago = timedelta(seconds=conf.feed.max_expires * 2)

        self.update_all_no_ctrl(expires=now + half, last_retrieved=now)
        self.assert_late_count(0, "all have just been retrieved, none expired")
        self.update_all_no_ctrl(expires=now - twice, last_retrieved=now - half)
        self.assert_late_count(0, "have been retrieved not too long ago")

        self.update_all_no_ctrl(expires=now + twice,
                                last_retrieved=now - long_ago)
        self.assert_late_count(total,
                               "all retrieved some time ago, not expired")
Example #19
    def post():
        opml_file = request.files['opml_file']

        try:
            subscriptions = opml.from_string(opml_file.read())
        except Exception as error:
            raise UnprocessableEntity("Couldn't parse OPML file (%r)" % error)

        ccontr = CategoryController(current_identity.id)
        fcontr = FeedController(current_identity.id)
        counts = {'created': 0, 'existing': 0, 'failed': 0, 'exceptions': []}
        categories = {cat.name: cat.id for cat in ccontr.read()}
        for line in subscriptions:
            try:
                link = line.xmlUrl
            except Exception as error:
                counts['failed'] += 1
                counts['exceptions'].append(str(error))
                continue

            # don't import twice
            if fcontr.read(link=link).count():
                counts['existing'] += 1
                continue

            # handling categories
            cat_id = None
            category = getattr(line, 'category', '').lstrip('/')
            if category:
                if category not in categories:
                    new_category = ccontr.create(name=category)
                    categories[new_category.name] = new_category.id
                cat_id = categories[category]

            fcontr.create(title=getattr(line, 'text', None),
                          category_id=cat_id,
                          description=getattr(line, 'description', None),
                          link=link,
                          site_link=getattr(line, 'htmlUrl', None))
            counts['created'] += 1
        code = 200
        if counts.get('created'):
            code = 201
        elif counts.get('failed'):
            code = 400
        return counts, code
Example #20
 def test_list_feeds(self):
     resp = self.jarr_client('get', 'list-feeds', user=self.user.login)
     fcount = FeedController(self.user.id).read().count()
     ccount = CategoryController(self.user.id).read().count()
     self.assertEqual(fcount + ccount + 1, len(resp.json))
     self.assertEqual(fcount,
                      len([r for r in resp.json if r['type'] == 'feed']))
     self.assertEqual(ccount,
                      len([r for r in resp.json if r['type'] == 'categ']))
Example #21
def reset_feeds():
    """Will reschedule all active feeds to be fetched in the next two hours"""
    fcontr = FeedController(ignore_context=True)
    now = utc_now()
    feeds = [
        feed[0]
        for feed in fcontr.get_active_feed().with_entities(fcontr._db_cls.id)
    ]

    step = timedelta(seconds=conf.feed.max_expires / len(feeds))
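    # spread expiration dates evenly over the max_expires window so feeds
    # are not all refetched at once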
    for i, feed_id in enumerate(feeds):
        fcontr.update(
            {'id': feed_id}, {
                'etag': '',
                'last_modified': '',
                'last_retrieved': datetime(1970, 1, 1, tzinfo=timezone.utc),
                'expires': now + i * step
            })
Example #22
 def _reset_feeds_freshness(**kwargs):
     if 'expires' not in kwargs:
         kwargs['expires'] = UNIX_START
     if 'last_retrieved' not in kwargs:
         kwargs['last_retrieved'] = UNIX_START
     if 'etag' not in kwargs:
         kwargs['etag'] = ''
     if 'last_modified' not in kwargs:
         kwargs['last_modified'] = ''
     FeedController().update({}, kwargs)
Example #23
    def get():
        """
        Construct a feed from (any) url.

        Returns
        -------
        feed:
            a dictionary with most of what's needed to construct a feed
            plus alternative links found during parsing

        """
        code = 406
        url = url_parser.parse_args()['url']
        feed = FeedBuilderController(url).construct()
        if feed.get('link'):
            code = 200
            fctrl = FeedController(current_identity.id)
            feed['same_link_count'] = fctrl.read(link=feed.get('link')).count()
        return feed, code
Example #24
 def update(self, filters, attrs, return_objs=False, commit=True):
     user_id = attrs.get('user_id', self.user_id)
     if 'feed_id' in attrs:
         feed = FeedController().get(id=attrs['feed_id'])
         if not (self.user_id is None or feed.user_id == user_id):
             raise Forbidden("no right on feed %r" % feed.id)
         attrs['category_id'] = feed.category_id
     if attrs.get('category_id'):
         cat = CategoryController().get(id=attrs['category_id'])
         if not (self.user_id is None or cat.user_id == user_id):
             raise Forbidden("no right on cat %r" % cat.id)
     return super().update(filters, attrs, return_objs, commit)
Example #25
 def _test_unread_on_cluster(self, read_reason):
     ccontr = ClusterController()
     fcontr = FeedController()
     cluster = ccontr.read().first()
     clusterizer = Clusterizer()
     self.assertFalse(clusterizer.get_config(cluster, 'cluster_enabled'))
     self.assertTrue(clusterizer.get_config(cluster, 'cluster_wake_up'))
     ccontr.update({'id': cluster.id}, {
         'read': True,
         'read_reason': read_reason
     })
     target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                               user_id=cluster.user_id).first()
     clusterizer = Clusterizer()
     self.assertFalse(clusterizer.get_config(target_feed,
                                             'cluster_enabled'))
     fcontr.update(
         {'id__in': [f.id for f in cluster.feeds] + [target_feed.id]}, {
             'cluster_wake_up': True,
             'cluster_enabled': True
         })
     clusterizer = Clusterizer()
     self.assertTrue(clusterizer.get_config(cluster, 'cluster_enabled'))
     target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                               user_id=cluster.user_id).first()
     article = self._clone_article(ArticleController(),
                                   cluster.main_article, target_feed)
     clusterizer = Clusterizer()
     self.assertTrue(clusterizer.get_config(article, 'cluster_wake_up'))
     ClusterController(cluster.user_id).clusterize_pending_articles()
     self.assertEqual(2, len(article.cluster.articles))
     self.assertInCluster(article, cluster)
     return ccontr.get(id=cluster.id)
Example #26
 def test_cluster_disabled_on_original_category(self):
     article = ArticleController().read(category_id__ne=None).first()
     art_cat_id = article.category_id
     cat_ctrl = CategoryController(article.user_id)
     cluster = article.cluster
     fctrl = FeedController(cluster.user_id)
     feed = fctrl.create(title='new feed', category_id=art_cat_id)
     fno_cat = fctrl.create(title='category-less')
     update_on_all_objs(users=[cluster.user], cluster_enabled=None)
     cat_ctrl.update({}, {'cluster_enabled': False})
     article = self.create_article_from(cluster, feed)
     self.assertEqual(1, len(article.cluster.articles))
     self.assertNotInCluster(article, cluster)
     article = self.create_article_from(cluster, fno_cat)
     self.assertEqual(1, len(article.cluster.articles))
     self.assertNotInCluster(article, cluster)
     cat_ctrl.update({'id': art_cat_id}, {'cluster_enabled': True})
     article = self.create_article_from(cluster, fno_cat)
     self.assertEqual(2, len(article.cluster.articles))
     self.assertInCluster(article, cluster)
     article = self.create_article_from(cluster, feed)
     self.assertEqual(3, len(article.cluster.articles))
     self.assertInCluster(article, cluster)
Example #27
 def create(self, **attrs):
     # handling special denorm for article rights
     if 'feed_id' not in attrs:
         raise Unauthorized("must provide feed_id when creating article")
     feed = FeedController(attrs.get('user_id',
                                     self.user_id)).get(id=attrs['feed_id'])
     if 'user_id' in attrs and not (feed.user_id == attrs['user_id']
                                    or self.user_id is None):
         raise Forbidden("no right on feed %r" % feed.id)
     attrs['user_id'], attrs['category_id'] = feed.user_id, feed.category_id
     attrs['vector'] = to_vector(attrs)
     if not attrs.get('link_hash') and attrs.get('link'):
         attrs['link_hash'] = digest(attrs['link'], alg='sha1', out='bytes')
     return super().create(**attrs)
Example #28
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
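    # route jobs to dedicated queues when enabled, otherwise everything
    # goes to the default queue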
    queue = Queues.CRAWLING if conf.crawler.use_queues else Queues.DEFAULT
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched on queue:%r",
                     feed, queue.value)
        process_feed.apply_async(args=[feed.id], queue=queue.value)
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be delete", feed)
            feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    queue = Queues.CLUSTERING if conf.crawler.use_queues else Queues.DEFAULT
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            logger.debug('Scheduling clusterizer for User(%d) on queue:%r',
                         user_id, queue.value)
            clusterizer.apply_async(args=[user_id], queue=queue.value)
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    metrics_users_any.apply_async()
    metrics_users_active.apply_async()
    metrics_users_long_term.apply_async()
    metrics_articles_unclustered.apply_async()
    observe_worker_result_since(start, 'scheduler', 'ok')
Example #29
 def test_MarkClustersAsRead_put_only_singles(self):
     feed = FeedController(self.user.id).read()[0]
     update_on_all_objs(feeds=[feed],
                        cluster_same_feed=True,
                        cluster_enabled=True)
     # creating a new article that will cluster
     ArticleController(self.user.id).create(entry_id='new entry_id',
                                            title='new title',
                                            content='new content',
                                            feed_id=feed.id,
                                            link=feed.articles[0].link)
     ClusterController(self.user.id).clusterize_pending_articles()
     self.assertClusterCount(18, {'filter': 'unread'})
     # one per feed
     self._mark_as_read(2, {'only_singles': True, 'filter': 'unread'})
     self.assertClusterCount(1, {'filter': 'unread'})
Example #30
 def get():
     user_id = current_identity.id
     user = UserController(user_id).get(id=user_id)
     categories = {
         cat.id: cat
         for cat in CategoryController(user_id).read()
     }
     response = make_response(
         render_template('opml.xml',
                         user=user,
                         categories=categories,
                         feeds=FeedController(user_id).read(),
                         now=utc_now()))
     for key, value in OK_GET_HEADERS.items():
         response.headers[key] = value
     return response