Example #1
    def test_articles_with_enclosure_and_fetched_content(self, truncated_cnt,
                                                         get_vector):
        self._clean_objs()
        get_vector.return_value = None
        truncated_cnt.return_value = {'type': 'fetched',
                                      'title': 'holy grail',
                                      'content': 'blue, no read, aaah',
                                      'link': 'https://monthy.python/brian'}
        feed = FeedController().read().first()
        FeedController().update({'id': feed.id},
                                {'truncated_content': True,
                                 'cluster_enabled': True})
        UserController().update({'id': feed.user_id},
                                {'cluster_enabled': True})

        builder = ClassicArticleBuilder(feed, self.entry_w_enclosure, {})
        self.assertIsNone(builder.article.get('article_type'))
        raw_articles = list(builder.enhance())
        self.assertEqual(2, len(raw_articles))
        self.assertEqual('audio', raw_articles[1]['article_type'].value)
        articles = []
        for raw_article in raw_articles:
            articles.append(
                ArticleController(feed.user_id).create(**raw_article))
        ClusterController(feed.user_id).clusterize_pending_articles()
        a1 = ArticleController().get(id=articles[0].id)
        a2 = ArticleController().get(id=articles[1].id)
        self.assertEqual(a1.cluster_id, a2.cluster_id)
        cluster = ClusterController().get(id=a1.cluster_id)
        self.assertEqual(2, cluster.content['v'])
        self.assertEqual(1, len(cluster.content['contents']))
        self.assertEqual('fetched', cluster.content['contents'][0]['type'])
Example #2
    def test_no_add_feed_skip(self):
        self.resp_status_code = 304
        self.assertEqual(BASE_COUNT, ArticleController().read().count())
        crawler()
        FeedController().update({}, {
            'filters': [{
                "type": "tag contains",
                "action on": "match",
                "pattern": "pattern5",
                "action": "skipped"
            }, {
                "type": "simple match",
                "action on": "match",
                "pattern": "pattern5",
                "action": "mark as read"
            }, {
                "type": "regex",
                "action on": "match",
                "pattern": "pattern5",
                "action": "skipped"
            }]
        })

        crawler()
        self.assertEqual(BASE_COUNT, ArticleController().read().count())
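Example #3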
    def create_article_from(self, cluster, feed, link=None):
        self.assertEqual(cluster.user_id, feed.user_id)
        suffix = str(randint(0, 9999))
        acontr = ArticleController(cluster.user_id)
        article = acontr.create(
            feed_id=feed.id,
            entry_id=cluster.main_article.entry_id + suffix,
            link=link or cluster.main_article.link,
            title=cluster.main_article.title + suffix,
            content=cluster.main_article.content + suffix,
            date=cluster.main_article.date + timedelta(1),
            retrieved_date=cluster.main_article.retrieved_date)
        ClusterController(cluster.user_id).clusterize_pending_articles()
        return acontr.read(id=article.id).first()
Example #4
    def test_adding_to_cluster_by_link(self):
        ccontr = ClusterController()

        cluster = ccontr.read().first()
        ccontr.update({'id': cluster.id}, {
            'read': True,
            'read_reason': 'marked'
        })
        cluster = ccontr.get(id=cluster.id)
        self.assertTrue(cluster.read)
        article = cluster.articles[0]
        articles_count = len(cluster.articles)

        fcontr = FeedController(cluster.user_id)
        acontr = ArticleController(cluster.user_id)
        fcontr.update({'id': article.feed_id}, {'cluster_wake_up': True})
        feed = fcontr.read(id__ne=article.feed_id).first()
        update_on_all_objs(articles=[article],
                           feeds=[feed],
                           cluster_enabled=True)

        self._clone_article(acontr, article, feed)
        ccontr.clusterize_pending_articles()

        cluster = ccontr.get(id=cluster.id)
        self.assertEqual(articles_count + 1, len(cluster.articles))
        self.assertFalse(cluster.read)
Example #5
    def _test_unread_on_cluster(self, read_reason):
        ccontr = ClusterController()
        fcontr = FeedController()
        cluster = ccontr.read().first()
        clusterizer = Clusterizer()
        self.assertFalse(clusterizer.get_config(cluster, 'cluster_enabled'))
        self.assertTrue(clusterizer.get_config(cluster, 'cluster_wake_up'))
        ccontr.update({'id': cluster.id}, {
            'read': True,
            'read_reason': read_reason
        })
        target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                                  user_id=cluster.user_id).first()
        clusterizer = Clusterizer()
        self.assertFalse(clusterizer.get_config(target_feed,
                                                'cluster_enabled'))
        fcontr.update(
            {'id__in': [f.id for f in cluster.feeds] + [target_feed.id]}, {
                'cluster_wake_up': True,
                'cluster_enabled': True
            })
        clusterizer = Clusterizer()
        self.assertTrue(clusterizer.get_config(cluster, 'cluster_enabled'))
        target_feed = fcontr.read(id__ne=cluster.main_article.feed_id,
                                  user_id=cluster.user_id).first()
        article = self._clone_article(ArticleController(),
                                      cluster.main_article, target_feed)
        clusterizer = Clusterizer()
        self.assertTrue(clusterizer.get_config(article, 'cluster_wake_up'))
        ClusterController(cluster.user_id).clusterize_pending_articles()
        self.assertEqual(2, len(article.cluster.articles))
        self.assertInCluster(article, cluster)
        return ccontr.get(id=cluster.id)
Example #6
    def main(self, article, filter_result=None):
        """Will add given article to a fitting cluster or create a cluster
        fitting that article."""
        filter_result = filter_result or {}
        allow_clustering = filter_result.get('clustering', True)
        filter_read = filter_result.get('read', False)
        filter_liked = filter_result.get('liked', False)
        logger.info('%r - processed filter: %r', article, filter_result)
        cluster_config = self.get_config(article.feed, 'cluster_enabled')

        # fetching article so that vector comparison is made on full content
        ArticleController(article.user_id).enhance(article)

        if not allow_clustering:
            cluster_event(context='clustering', result='filter forbid')
        elif not cluster_config:
            cluster_event(context='clustering', result='config forbid')
        else:
            cluster = self._get_cluster_by_link(article)
            if not cluster:
                if not self.get_config(article.feed, 'cluster_tfidf_enabled'):
                    cluster_event(context='tfidf', result='config forbid')
                elif article.article_type in NO_CLUSTER_TYPE:
                    cluster_event(context='tfidf', result='wrong article type')
                else:
                    cluster = self._get_cluster_by_similarity(article)
            if cluster:
                return self.enrich_cluster(cluster, article, filter_read,
                                           filter_liked)
        return self._create_from_article(article, filter_read, filter_liked)
Example #7
    def test_feed_and_article_deletion(self):
        ccontr = CategoryController(2)
        cat = ccontr.read().first()
        ccontr.delete(cat.id)
        self.assertEqual(0,
                         ArticleController().read(category_id=cat.id).count())
        self.assertEqual(0, FeedController().read(category_id=cat.id).count())
Example #8
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched", feed)
        process_feed.apply_async(args=[feed.id])
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be deleted", feed)
            feed_cleaner.apply_async(args=[feed.id])
            break  # only one at a time
    # applying clusterizer
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if not UserController().get(id=user_id).effectivly_active:
            continue
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            clusterizer.apply_async(args=[user_id])
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    WORKER.labels(method='scheduler').observe(
        (datetime.now() - start).total_seconds())
    update_slow_metrics.apply_async()
Example #9
    def test_http_crawler_add_articles(self):
        self.assertEqual(BASE_COUNT, ArticleController().read().count())

        crawler()
        articles = list(ArticleController().read())
        new_count = len(articles)
        self.assertNotEqual(BASE_COUNT, new_count)
        self.assertTrue(BASE_COUNT < new_count)

        for art in articles:
            self.assertFalse('srcset=' in art.content)
            self.assertFalse('src="/' in art.content)

        self.resp_status_code = 304
        crawler()
        self.assertEqual(new_count, ArticleController().read().count())
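Example #10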
    def test_cluster_same_feed(self):
        article = ArticleController().read(category_id__ne=None).first()
        cluster = article.cluster
        # all is enabled, article in cluster
        update_on_all_objs(articles=cluster.articles,
                           cluster_enabled=True,
                           cluster_same_feed=True)
        article = self.create_article_from(cluster, cluster.main_article.feed)
        self.assertInCluster(article, cluster)
        # feed's disabled, won't cluster
        FeedController().update(
            {'id__in': [a.feed_id for a in cluster.articles]},
            {'cluster_same_feed': False})
        article = self.create_article_from(cluster, cluster.main_article.feed)
        self.assertNotInCluster(article, cluster)
        # category's disabled, won't cluster
        FeedController().update(
            {'id__in': [a.feed_id for a in cluster.articles]},
            {'cluster_same_feed': None})
        CategoryController().update({'id': cluster.main_article.category.id},
                                    {'cluster_same_feed': False})
        article = self.create_article_from(cluster, cluster.main_article.feed)
        self.assertNotInCluster(article, cluster)
        # user's disabled, won't cluster
        CategoryController().update({'id': cluster.main_article.category.id},
                                    {'cluster_same_feed': None})
        UserController().update({'id': cluster.user_id},
                                {'cluster_same_feed': False})
        article = self.create_article_from(cluster, cluster.main_article.feed)
        self.assertNotInCluster(article, cluster)
        # reenabling user, will cluster
        UserController().update({'id': cluster.user_id},
                                {'cluster_same_feed': True})
        article = self.create_article_from(cluster, cluster.main_article.feed)
        self.assertInCluster(article, cluster)
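Example #11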
    def test_cluster_tfidf_control(self):
        article = ArticleController().read(category_id__ne=None).first()
        cluster = article.cluster

        # leaving one cluster with one article
        clu_ids = [c.id for c in ClusterController().read(id__ne=cluster.id)]
        art_ids = [
            a.id
            for a in ArticleController().read(id__ne=cluster.main_article_id)
        ]
        ArticleController().update({'id__in': art_ids}, {'cluster_id': None})
        for clu_id in clu_ids:
            ClusterController().delete(clu_id)
        for art_id in art_ids:
            ArticleController().delete(art_id)
        self.assertEqual(1, ClusterController().read().count())
        self.assertEqual(1, ArticleController().read().count())

        feed1 = FeedController(cluster.user_id).create(
            title='new feed',
            cluster_conf={
                'tfidf_min_score': -1,
                'tfidf_min_sample_size': 1
            })
        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed1],
                           cluster_tfidf_enabled=True,
                           cluster_enabled=True)
        feed2 = FeedController(cluster.user_id).create(
            cluster_enabled=True,
            cluster_tfidf_enabled=False,
            title='new feed',
            cluster_conf={
                'tfidf_min_score': -1,
                'tfidf_min_sample_size': 1
            })

        article = self.create_article_from(cluster,
                                           feed1,
                                           link=cluster.main_article.link +
                                           'do not match link')
        self.assertInCluster(article, cluster, ClusterReason.tf_idf)
        article = self.create_article_from(cluster,
                                           feed2,
                                           link=cluster.main_article.link +
                                           'do not match link either')
        self.assertNotInCluster(article, cluster)
Example #12
    def test_delete_main_cluster_handling(self):
        suffix = 'suffix'
        clu = ClusterController().get(id=10)
        acontr = ArticleController(clu.user_id)
        fcontr = FeedController(clu.user_id)
        old_title = clu.main_title
        old_feed_title, old_art_id = clu.main_feed_title, clu.main_article_id
        for art_to_del in acontr.read(link=clu.main_article.link,
                                      id__ne=clu.main_article.id):
            acontr.delete(art_to_del.id)

        other_feed = fcontr.read(id__ne=clu.main_article.feed_id).first()
        update_on_all_objs(articles=[clu.main_article],
                           feeds=[other_feed],
                           cluster_enabled=True)
        acontr.create(
            feed_id=other_feed.id,
            entry_id=clu.main_article.entry_id + suffix,
            link=clu.main_article.link,
            title=clu.main_article.title + suffix,
            content=clu.main_article.content + suffix,
            date=clu.main_article.date + timedelta(1),
            retrieved_date=clu.main_article.retrieved_date + timedelta(1),
        )

        ClusterController(clu.user_id).clusterize_pending_articles()
        clu = ClusterController().get(id=10)
        self.assertEqual(2, len(clu.articles))
        fcontr.delete(clu.main_article.feed_id)
        new_cluster = ClusterController(clu.user_id).get(id=clu.id)
        self.assertEqual(1, len(new_cluster.articles))
        self.assertNotEqual(old_title, new_cluster.main_title)
        self.assertNotEqual(old_feed_title, new_cluster.main_feed_title)
        self.assertNotEqual(old_art_id, new_cluster.main_article_id)
Example #13
def populate_db():
    fcontr = FeedController()
    ccontr = CategoryController()
    UserController().create(
        **{
            'is_admin': True,
            'is_api': True,
            'cluster_enabled': False,
            'login': '******',
            'password': '******'
        })
    user1, user2 = [
        UserController().create(login=name,
                                cluster_enabled=False,
                                email="*****@*****.**" % name,
                                password=name) for name in ["user1", "user2"]
    ]

    for iteration in range(2):
        article_total = 0

        for user in (user1, user2):
            for iter_cat in range(3):
                cat_id = None
                if iter_cat:
                    cat_id = ccontr.create(user_id=user.id,
                                           name=to_name(
                                               user, iteration, iter_cat)).id
                feed_id = fcontr.create(
                    link="feed%d%d" % (iteration, iter_cat),
                    user_id=user.id,
                    category_id=cat_id,
                    title=to_name(user, iteration, iter_cat, iter_cat)).id
                for iter_art in range(3):
                    entry = to_name(user, iteration, iter_cat, iter_cat,
                                    iter_art)

                    tags = [
                        to_name(user, iteration, iter_cat, iter_cat, iter_art,
                                str(i)) for i in range(2)
                    ]
                    article_total += 1
                    ArticleController().create(
                        entry_id=entry,
                        link='http://test.te/%d' % article_total,
                        feed_id=feed_id,
                        user_id=user.id,
                        tags=tags,
                        category_id=cat_id,
                        title=entry,
                        date=utc_now() + timedelta(seconds=iteration),
                        content="content %d" % article_total)

    session.commit()
    session.flush()
    ClusterController().clusterize_pending_articles()
Example #14
    def test_matching_etag(self):
        self._reset_feeds_freshness(etag='fake etag')
        self.resp_headers = {'etag': 'fake etag'}
        self.assertEqual(BASE_COUNT, ArticleController().read().count())

        crawler()

        self.assertEqual(BASE_COUNT, ArticleController().read().count())
        self._reset_feeds_freshness(etag='jarr/"%s"' % to_hash(self._content))
        self.resp_headers = {'etag': 'jarr/"%s"' % to_hash(self._content)}

        crawler()
        self.assertEqual(BASE_COUNT, ArticleController().read().count())

        self._reset_feeds_freshness(etag='jarr/fake etag')
        self.resp_headers = {'etag': '########################'}

        crawler()
        self.assertNotEqual(BASE_COUNT, ArticleController().read().count())
Example #15
    def test_articles_with_enclosure(self):
        self._clean_objs()
        feed = FeedController().read().first()
        UserController().update({'id': feed.user_id},
                                {'cluster_enabled': True})
        builder = ClassicArticleBuilder(feed, self.entry_w_enclosure, {})
        self.assertIsNone(builder.article.get('article_type'))
        raw_articles = list(builder.enhance())
        self.assertEqual(2, len(raw_articles))
        self.assertEqual('audio', raw_articles[1]['article_type'].value)
        articles = []
        for raw_article in raw_articles:
            articles.append(
                ArticleController(feed.user_id).create(**raw_article))
        ClusterController(feed.user_id).clusterize_pending_articles()
        a1 = ArticleController().get(id=articles[0].id)
        a2 = ArticleController().get(id=articles[1].id)
        cluster = ClusterController().get(id=a1.cluster_id)
        self.assertEqual(a1.cluster_id, a2.cluster_id)
        self.assertEqual(2, cluster.content['v'])
        self.assertEqual(0, len(cluster.content['contents']))
Example #16
    def create_missing_article(self, response):
        logger.info('%r: cache validation failed, challenging entries',
                    self.feed)
        parsed = self.parse_feed_response(response)
        if parsed is None:
            return

        ids, entries, skipped_list = [], {}, []
        for entry in parsed['entries']:
            if not entry:
                continue
            builder = self.article_builder(self.feed, entry)
            if builder.do_skip_creation:
                skipped_list.append(builder.entry_ids)
                logger.debug('%r: skipping article', self.feed)
                continue
            entry_ids = builder.entry_ids
            entries[tuple(sorted(entry_ids.items()))] = builder
            ids.append(entry_ids)
        if not ids and skipped_list:
            logger.debug('%r: nothing to add (skipped %r) %r', self.feed,
                         skipped_list, parsed)
            return
        logger.debug("%r: found %d entries %r", self.feed, len(ids), ids)

        article_created = False
        actrl = ArticleController(self.feed.user_id)
        new_entries_ids = list(actrl.challenge(ids=ids))
        logger.debug("%r: %d entries weren't matched and will be created",
                     self.feed, len(new_entries_ids))
        for id_to_create in new_entries_ids:
            article_created = True
            builder = entries[tuple(sorted(id_to_create.items()))]
            new_article = builder.enhance()
            article = actrl.create(**new_article)
            logger.info('%r: created %r', self.feed, article)

        if not article_created:
            logger.info('%r: all articles matched in db, adding nothing',
                        self.feed)
Example #17
    def test_ClusterResource_delete(self):
        cluster = ClusterController().read().first()
        user = UserController().get(id=cluster.user_id)
        resp = self.jarr_client('delete', 'cluster', cluster.id)
        self.assertStatusCode(401, resp)
        resp = self.jarr_client('delete', 'cluster', cluster.id, user='******')
        self.assertStatusCode(403, resp)
        resp = self.jarr_client('delete', 'cluster', cluster.id,
                user=user.login)
        self.assertStatusCode(204, resp)

        self.assertEqual(0, ClusterController().read(id=cluster.id).count())
        self.assertEqual(0,
                ArticleController().read(cluster_id=cluster.id).count())
Example #18
    def test_model_relations(self):
        article = ArticleController().read(category_id__ne=None).first()
        # article relations
        self.assertIsNotNone(article.cluster)
        self.assertIsNotNone(article.category)
        self.assertIsNotNone(article.feed)
        # feed parent relation
        self.assertEqual(article.category, article.feed.category)

        self.assertIn(article.cluster, article.feed.clusters)
        self.assertIn(article.cluster, article.category.clusters)
        self.assertIn(article.feed, article.cluster.feeds)
        self.assertIn(article.category, article.cluster.categories)

        self.assertIn(article.cluster.main_article, article.cluster.articles)
Example #19
    def test_MarkClustersAsRead_put_only_singles(self):
        feed = FeedController(self.user.id).read()[0]
        update_on_all_objs(feeds=[feed],
                           cluster_same_feed=True,
                           cluster_enabled=True)
        # creating a new article that will cluster
        ArticleController(self.user.id).create(entry_id='new entry_id',
                                               title='new title',
                                               content='new content',
                                               feed_id=feed.id,
                                               link=feed.articles[0].link)
        ClusterController(self.user.id).clusterize_pending_articles()
        self.assertClusterCount(18, {'filter': 'unread'})
        # one per feed
        self._mark_as_read(2, {'only_singles': True, 'filter': 'unread'})
        self.assertClusterCount(1, {'filter': 'unread'})
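Example #20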
    def test_no_cluster_same_category_on_original_category(self):
        article = ArticleController().read(category_id__ne=None).first()
        art_cat_id = article.category_id
        cat_ctrl = CategoryController(article.user_id)
        cluster = article.cluster
        feed = FeedController(cluster.user_id).create(title='new feed',
                                                      category_id=art_cat_id)
        update_on_all_objs(articles=cluster.articles,
                           feeds=[feed],
                           cluster_same_category=None,
                           cluster_enabled=True)
        cat_ctrl.update({'id': art_cat_id}, {'cluster_same_category': False})
        article = self.create_article_from(cluster, feed)
        self.assertNotInCluster(article, cluster)
        cat_ctrl.update({'id': art_cat_id}, {'cluster_same_category': True})
        article = self.create_article_from(cluster, feed)
        self.assertInCluster(article, cluster)
Example #21
    def test_opml_dump_and_restore(self):
        # downloading OPML export file
        resp = self.jarr_client('get', '/opml', user=self.user.login)
        self.assertStatusCode(200, resp)
        opml_dump = resp.data.decode()
        self.assertTrue(
            opml_dump.startswith('<?xml version="1.0" encoding="utf-8"'))
        self.assertTrue(opml_dump.endswith('</opml>'))
        # cleaning db
        actrl = ArticleController(self.user.id)
        for item in actrl.read():
            actrl.delete(item.id)
        self.assertEqual(0, ClusterController(self.user.id).read().count())
        self.assertEqual(0, ArticleController(self.user.id).read().count())
        no_category_feed = []
        existing_feeds = {}
        for feed in self.fctrl.read():
            if feed.category:
                if feed.category.name in existing_feeds:
                    existing_feeds[feed.category.name].append(feed.title)
                else:
                    existing_feeds[feed.category.name] = [feed.title]
            else:
                no_category_feed.append(feed.title)

            self.fctrl.delete(feed.id)
        for category in self.cctrl.read():
            self.cctrl.delete(category.id)
        # re-importing OPML
        import_resp = self.jarr_client(
            'post',
            'opml',
            to_json=False,
            data={'opml_file': (BytesIO(resp.data), 'opml.xml')},
            headers=None,
            user=self.user.login)
        self.assertStatusCode(201, import_resp)
        self.assertEqual(0, import_resp.json['existing'])
        self.assertEqual(0, import_resp.json['failed'])
        self._check_opml_imported(existing_feeds, no_category_feed)

        import_resp = self.jarr_client(
            'post',
            'opml',
            to_json=False,
            data={'opml_file': (BytesIO(resp.data), 'opml.xml')},
            headers=None,
            user=self.user.login)
        self.assertStatusCode(200, import_resp)
        self.assertEqual(0, import_resp.json['created'])
        self.assertEqual(0, import_resp.json['failed'])
Example #22
    def _get_query_for_clustering(self, article, filters, filter_tfidf=False):
        time_delta = timedelta(days=conf.clustering.time_delta)
        date_cond = {
            'date__lt': article.date + time_delta,
            'date__gt': article.date - time_delta
        }
        retr_cond = {
            'retrieved_date__lt': article.retrieved_date + time_delta,
            'retrieved_date__gt': article.retrieved_date - time_delta
        }
        filters.update({
            'cluster_id__ne': None,
            'user_id': article.user_id,
            'id__ne': article.id,
            '__or__': [date_cond, retr_cond]
        })
        if article.category_id \
                and not self.get_config(article, 'cluster_same_category'):
            filters['category_id__ne'] = article.category_id
        if not self.get_config(article, 'cluster_same_feed'):
            filters['feed_id__ne'] = article.feed_id

        feed_join = [
            Feed.id == Article.feed_id,
            or_(Feed.cluster_enabled.__eq__(True),
                Feed.cluster_enabled.__eq__(None))
        ]
        if filter_tfidf:
            feed_join.append(
                or_(Feed.cluster_tfidf_enabled.__eq__(True),
                    Feed.cluster_tfidf_enabled.__eq__(None)))

        query = ArticleController(article.user_id).read(**filters)\
                .join(Feed, and_(*feed_join))

        # operations involving categories are complicated, handling in software
        for candidate in query:
            if not self.get_config(candidate, "cluster_enabled"):
                continue
            if filter_tfidf and \
                    not self.get_config(candidate, "cluster_tfidf_enabled"):
                continue
            yield candidate
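Example #23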
    def test_cluster_disabled_on_original_category(self):
        article = ArticleController().read(category_id__ne=None).first()
        art_cat_id = article.category_id
        cat_ctrl = CategoryController(article.user_id)
        cluster = article.cluster
        fctrl = FeedController(cluster.user_id)
        feed = fctrl.create(title='new feed', category_id=art_cat_id)
        fno_cat = fctrl.create(title='category-less')
        update_on_all_objs(users=[cluster.user], cluster_enabled=None)
        cat_ctrl.update({}, {'cluster_enabled': False})
        article = self.create_article_from(cluster, feed)
        self.assertEqual(1, len(article.cluster.articles))
        self.assertNotInCluster(article, cluster)
        article = self.create_article_from(cluster, fno_cat)
        self.assertEqual(1, len(article.cluster.articles))
        self.assertNotInCluster(article, cluster)
        cat_ctrl.update({'id': art_cat_id}, {'cluster_enabled': True})
        article = self.create_article_from(cluster, fno_cat)
        self.assertEqual(2, len(article.cluster.articles))
        self.assertInCluster(article, cluster)
        article = self.create_article_from(cluster, feed)
        self.assertEqual(3, len(article.cluster.articles))
        self.assertInCluster(article, cluster)
Example #24
def scheduler():
    logger.warning("Running scheduler")
    start = datetime.now()
    fctrl = FeedController()
    # browsing feeds to fetch
    queue = Queues.CRAWLING if conf.crawler.use_queues else Queues.DEFAULT
    feeds = list(fctrl.list_fetchable(conf.crawler.batch_size))
    WORKER_BATCH.labels(worker_type='fetch-feed').observe(len(feeds))
    logger.info('%d to enqueue', len(feeds))
    for feed in feeds:
        logger.debug("%r: scheduling to be fetched on queue:%r",
                     feed, queue.value)
        process_feed.apply_async(args=[feed.id], queue=queue.value)
    # browsing feeds to delete
    feeds_to_delete = list(fctrl.read(status=FeedStatus.to_delete))
    if feeds_to_delete and REDIS_CONN.setnx(JARR_FEED_DEL_KEY, 'true'):
        REDIS_CONN.expire(JARR_FEED_DEL_KEY, LOCK_EXPIRE)
        logger.info('%d to delete, deleting one', len(feeds_to_delete))
        for feed in feeds_to_delete:
            logger.debug("%r: scheduling to be deleted", feed)
            feed_cleaner.apply_async(args=[feed.id])
    # applying clusterizer
    queue = Queues.CLUSTERING if conf.crawler.use_queues else Queues.DEFAULT
    for user_id in ArticleController.get_user_id_with_pending_articles():
        if REDIS_CONN.setnx(JARR_CLUSTERIZER_KEY % user_id, 'true'):
            REDIS_CONN.expire(JARR_CLUSTERIZER_KEY % user_id,
                              conf.crawler.clusterizer_delay)
            logger.debug('Scheduling clusterizer for User(%d) on queue:%r',
                         user_id, queue.value)
            clusterizer.apply_async(args=[user_id], queue=queue.value)
    scheduler.apply_async(countdown=conf.crawler.idle_delay)
    metrics_users_any.apply_async()
    metrics_users_active.apply_async()
    metrics_users_long_term.apply_async()
    metrics_articles_unclustered.apply_async()
    observe_worker_result_since(start, 'scheduler', 'ok')
Example #25
    def test_feed_rights(self):
        feed = FeedController(2).read()[0]
        self.assertEqual(3, ArticleController().read(feed_id=feed.id).count())
        self._test_controller_rights(feed,
                                     UserController().get(id=feed.user_id))
Example #26
def metrics_articles_unclustered():
    logger.debug('Counting unclustered articles')
    unclustered = ArticleController().count_unclustered()
    ARTICLES.labels(status='unclustered').set(unclustered)
Example #27
    def test_feed_rights(self):
        cat = CategoryController(2).read().first()
        self.assertEqual(3,
                         ArticleController().read(category_id=cat.id).count())
        self.assertEqual(1, FeedController().read(category_id=cat.id).count())
        self._test_controller_rights(cat, UserController().get(id=cat.user_id))
Example #28
    def _test_create_using_filters(self):
        # FIXME wait redo filters
        feed_ctr = FeedController(USER_ID)
        acontr = ArticleController(USER_ID)
        feed1, feed2, feed3 = [f for f in feed_ctr.read()][0:3]
        feed_ctr.update({'id': feed3.id}, {
            'cluster_enabled':
            True,
            'filters': [{
                "type": "regex",
                "pattern": ".*(pattern1|pattern2).*",
                "action on": "no match",
                "action": "mark as favorite"
            }, {
                "type": "simple match",
                "pattern": "pattern3",
                "action on": "match",
                "action": "mark as read"
            }]
        })
        feed_ctr.update({'id': feed1.id}, {
            'filters': [{
                "type": "simple match",
                "pattern": "pattern3",
                "action on": "match",
                "action": "mark as read"
            }]
        })
        feed_ctr.update({'id': feed2.id}, {
            'filters': [{
                "type": "tag match",
                "pattern": "pattern4",
                "action on": "match",
                "action": "skipped"
            }, {
                "type": "tag contains",
                "pattern": "pattern5",
                "action on": "match",
                "action": "skipped"
            }]
        })

        art1 = acontr.create(entry_id="will be read and faved 1",
                             feed_id=feed1.id,
                             title="garbage pattern1 pattern3 garbage",
                             content="doesn't matter",
                             link="cluster1")

        art2 = acontr.create(entry_id="will be ignored 2",
                             feed_id=feed1.id,
                             title="garbage see pattern garbage",
                             content="doesn't matter2",
                             link="is ignored 2")

        art3 = acontr.create(entry_id="will be read 3",
                             user_id=2,
                             feed_id=feed2.id,
                             title="garbage pattern3 garbage",
                             content="doesn't matter",
                             link="doesn't matter either3")

        art4 = acontr.create(entry_id="will be ignored 4",
                             user_id=2,
                             feed_id=feed2.id,
                             title="garbage see pattern garbage",
                             content="doesn't matter2",
                             link="doesn't matter either4")

        art5 = acontr.create(entry_id="will be faved 5",
                             feed_id=feed3.id,
                             title="garbage anti-attern3 garbage",
                             content="doesn't matter",
                             link="cluster1")
        art6 = acontr.create(entry_id="will be faved 6",
                             feed_id=feed3.id,
                             title="garbage pattern1 garbage",
                             content="doesn't matter2",
                             link="doesn't matter 6")
        art7 = acontr.create(entry_id="will be read 7",
                             feed_id=feed3.id,
                             title="garbage pattern3 garbage",
                             content="doesn't matter3",
                             link="doesn't matter either7")

        art8 = acontr.create(entry_id="will be ignored",
                             feed_id=feed3.id,
                             title="garbage pattern4 garbage",
                             content="doesn't matter4-matter4_matter4",
                             lang='fa_ke',
                             link="doesn't matter either8")

        art9 = acontr.create(entry_id="unique9",
                             feed_id=feed2.id,
                             title="garbage",
                             tags=['garbage', 'pattern4'],
                             content="doesn't matterç",
                             link="doesn't matter either9")

        art10 = acontr.create(entry_id="will be ignored",
                              feed_id=feed2.id,
                              title="garbage",
                              tags=['pattern5 garbage', 'garbage'],
                              content="doesn't matter10",
                              link="doesn't matter either10")

        ClusterController(USER_ID).clusterize_pending_articles()

        self.assertTrue(acontr.get(id=art1.id).cluster.read)
        self.assertFalse(acontr.get(id=art1.id).cluster.liked)
        self.assertFalse(acontr.get(id=art2.id).cluster.read)
        self.assertFalse(acontr.get(id=art2.id).cluster.liked)
        self.assertFalse(acontr.get(id=art3.id).cluster.read)
        self.assertFalse(acontr.get(id=art3.id).cluster.liked)
        self.assertFalse(acontr.get(id=art4.id).cluster.read)
        self.assertFalse(acontr.get(id=art4.id).cluster.liked)
        self.assertTrue(art5.cluster.read,
                        "should be read because it clustered")
        self.assertTrue(art5.cluster.liked)
        self.assertFalse(art6.cluster.read)
        self.assertFalse(art6.cluster.liked)
        self.assertTrue(art7.cluster.read)
        self.assertTrue(art7.cluster.liked)
        self.assertFalse(art8.cluster.read)
        self.assertTrue(art8.cluster.liked)
        self.assertIsNone(art9)
        self.assertEqual(0, acontr.read(entry_id='unique9').count())
        self.assertIsNone(art10)
        self.assertEqual(0, acontr.read(entry_id='unique10').count())
Example #29
    def test_article_rights(self):
        article = ArticleController(USER_ID).read().first()
        self._test_controller_rights(article,
                                     UserController().get(id=article.user_id))
Example #30
    def delete(self, obj_id, commit=True):
        from jarr.controllers import ArticleController, ClusterController
        fltr = {"user_id": obj_id}
        ClusterController(self.user_id).update(fltr, {"main_article_id": None})
        ArticleController(self.user_id).update(fltr, {"cluster_id": None})
        return super().delete(obj_id)