Example #1
    def test_look_for_workcluster_deduplication_task(self, Lock, _):
        lock = Lock.return_value
        lock.__enter__ = Mock(return_value=None)
        lock.__exit__ = Mock(return_value=None)

        works = Work.objects.filter(id__in=self.work_ids)
        # Create duplicate WorkClusters on purpose.
        for _ in range(5):
            create_work_cluster(works, perform_union=False)

        self.assertEqual(WorkCluster.objects.count(), 5)
        tasks.look_for_workclusters()
        # All duplicates have been reduced to one WorkCluster.
        self.assertEqual(WorkCluster.objects.count(), 1)
        self.assertEqual(lock.__enter__.call_count, 1)
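The `(self, Lock, _)` signature implies two `mock.patch` decorators that the snippet does not show. A minimal sketch of what that decorator stack could look like, assuming the task module imports `redis_lock.Lock` and `redis.StrictRedis` under these hypothetical paths:

from unittest.mock import Mock, patch

# Hypothetical patch targets; the real module path may differ.
@patch('mangaki.tasks.redis.StrictRedis')  # injected second, received as `_`
@patch('mangaki.tasks.redis_lock.Lock')    # bottom-most patch, received as `Lock`
def test_look_for_workcluster_deduplication_task(self, Lock, _):
    ...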
Example #2
    def test_merge_work_clusters(self):
        works = Work.objects.filter(id__in=self.work_ids)
        clusters = []

        for _ in range(5):
            clusters.append(create_work_cluster(works, perform_union=False))

        self.assertEqual(WorkCluster.objects.count(), 5)

        merge_work_clusters(*clusters)

        self.assertEqual(WorkCluster.objects.count(), 1)
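As asserted above, `merge_work_clusters` collapses several clusters into a single one. A minimal sketch of such a helper, assuming a `works` many-to-many field on `WorkCluster` (both the field name and the keep-the-oldest policy are assumptions, not taken from the project):

from django.db import transaction

def merge_work_clusters(*clusters):
    # Keep the oldest cluster and fold the others into it.
    first, *rest = sorted(clusters, key=lambda c: c.id)
    with transaction.atomic():
        for cluster in rest:
            first.works.add(*cluster.works.all())
            cluster.delete()
    return first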
Example #3
def look_for_workclusters(steal_workcluster: bool = False):
    """
    A maintenance Celery Task which clusters works in the database,
    creating WorkCluster objects.

    Args:
        steal_workcluster (bool): Allow this task to merge non-automatic
            WorkClusters with automatic ones (i.e. when a duplicate WorkCluster
            was created by a human user, whether to "steal" it and merge it
            into the new one).

    Returns: None.

    """

    logger.info('Looking for easy WorkClusters to create...')
    with redis_lock.Lock(redis.StrictRedis(connection_pool=redis_pool),
                         'lock-wc-lookout',
                         expire=DEFAULT_LOCK_EXPIRATION_TIME):
        logger.info('Acquired Redis lock.')
        # MAL-created duplicates
        duplicates = Work.objects.values('title', 'category_id').annotate(
            Count('id')).filter(id__count__gte=2)
        for dupe in duplicates.iterator():
            works = Work.objects.filter(
                title=dupe['title'],
                category_id=dupe['category_id'],
            ).prefetch_related('workcluster_set')
            cluster = create_work_cluster(works)
            logger.info('Clustered {} works. ({})'.format(
                len(works), cluster.id))

        logger.info('Clustering done.')
        logger.info('Compressing redundant work clusters.')
        for work in Work.objects.prefetch_related(
                'workcluster_set').iterator():
            # Only merge automatic unprocessed work clusters.
            cluster_filter = Q(status='unprocessed')
            if not steal_workcluster:  # Don't be evil. Don't steal human WorkClusters.
                cluster_filter &= Q(user=None)  # restrict to automatic clusters
            clusters = work.workcluster_set.filter(cluster_filter).order_by(
                'id').all()
            if len(clusters) > 1:
                merge_work_clusters(*clusters)
                logger.info('{} clusters merged.'.format(len(clusters)))
        logger.info('Compression done.')
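The surrounding `redis_lock.Lock` context manager comes from the python-redis-lock package and ensures only one worker runs the clustering at a time. A standalone illustration of that pattern (the connection settings are placeholders):

import redis
import redis_lock

conn = redis.StrictRedis()  # placeholder connection settings
with redis_lock.Lock(conn, 'lock-wc-lookout', expire=600):
    # Critical section: another worker acquiring the same lock name
    # blocks until the lock is released or its expiry elapses.
    pass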
Example #4
    def test_create_work_clusters_with_union(self):
        works = Work.objects.filter(id__in=self.work_ids)

        for _ in range(5):
            create_work_cluster(works, perform_union=True)
            self.assertEqual(WorkCluster.objects.count(), 1)
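This test relies on `perform_union=True` folding each fresh cluster into the existing one, which keeps the count at 1 across all five iterations. A hypothetical sketch of that behaviour (the helper's real signature and field names may differ):

def create_work_cluster(works, perform_union=True):
    cluster = WorkCluster.objects.create(status='unprocessed')
    cluster.works.set(works)  # assumes a `works` many-to-many field
    if perform_union:
        # Merge with any pre-existing clusters over the same works.
        others = list(WorkCluster.objects.exclude(id=cluster.id))
        if others:
            cluster = merge_work_clusters(cluster, *others)
    return cluster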