Example #1
0
    def test_delete_orphans_with_exclusions(self):
        """Verify an orphan is not deleted if it is passed in as excluded"""
        excluded_orphan = VideoFactory()

        delete_orphans(Video, {excluded_orphan.pk})

        survivors = excluded_orphan.__class__.objects.filter(pk=excluded_orphan.pk)
        self.assertTrue(survivors.exists())
Example #2
0
    def test_delete_orphans(self):
        """ Verify the delete_orphans method deletes orphaned instances. """
        orphaned_video = VideoFactory()
        referenced_video = CourseRunFactory().video

        delete_orphans(Video)

        # A video still attached to a course run must survive the purge...
        remaining = referenced_video.__class__.objects.filter(pk=referenced_video.pk)
        assert remaining.exists()
        # ...while the unreferenced one must be gone.
        purged = orphaned_video.__class__.objects.filter(pk=orphaned_video.pk)
        assert not purged.exists()
Example #3
0
 def delete_orphans(cls):
     """ Remove orphaned objects from the database. """
     # NOTE(review): the `cls` parameter is never used, and the call below
     # targets a name identical to this function's own. Unless an imported
     # `delete_orphans` utility is intended here (and is shadowed by this
     # definition), this recurses infinitely — confirm against the original
     # module's imports.
     for model in (Image, Video):
         delete_orphans(model)
    def handle(self, *args, **options):
        """
        Refresh course metadata for all partners (or a single one when the
        ``partner_code`` option is supplied).

        For each partner, runs the API data-loader pipeline — serially, or in
        per-stage process pools when the ``parallel_refresh_pipeline`` waffle
        switch is active — then deletes orphaned media objects and updates the
        API cache timestamp.

        Raises:
            CommandError: if no partners match the filter, or if one or more
                data loaders report failure.
        """
        # We only want to invalidate the API response cache once data loading
        # completes. Disconnecting the api_change_receiver function from post_save
        # and post_delete signals prevents model changes during data loading from
        # repeatedly invalidating the cache.
        for model in apps.get_app_config('course_metadata').get_models():
            for signal in (post_save, post_delete):
                signal.disconnect(receiver=api_change_receiver, sender=model)

        # For each partner defined...
        partners = Partner.objects.all()

        # If a specific partner was indicated, filter down the set
        partner_code = options.get('partner_code')
        if partner_code:
            partners = partners.filter(short_code=partner_code)

        if not partners:
            raise CommandError('No partners available!')

        success = True
        for partner in partners:

            # The Linux kernel implements copy-on-write when fork() is called to create a new
            # process. Pages that the parent and child processes share, such as the database
            # connection, are marked read-only. If a write is performed on a read-only page
            # (e.g., closing the connection), it is then copied, since the memory is no longer
            # identical between the two processes. This leads to the following behavior:
            #
            # 1) Newly forked process
            #       parent
            #              -> connection (Django open, MySQL open)
            #       child
            #
            # 2) Child process closes the connection
            #       parent -> connection (*Django open, MySQL closed*)
            #       child  -> connection (Django closed, MySQL closed)
            #
            # Calling connection.close() from a child process causes the MySQL server to
            # close a connection which the parent process thinks is still usable. Since
            # the parent process thinks the connection is still open, Django won't attempt
            # to open a new one, and the parent ends up running a query on a closed connection.
            # This results in a 'MySQL server has gone away' error.
            #
            # To resolve this, we force Django to reconnect to the database before running any queries.
            connection.connect()

            # If no courses exist for this partner, this command is likely being run on a
            # new catalog installation. In that case, we don't want multiple threads racing
            # to create courses. If courses do exist, this command is likely being run
            # as an update, significantly lowering the probability of race conditions.
            courses_exist = Course.objects.filter(partner=partner).exists()
            is_threadsafe = courses_exist and waffle.switch_is_active('threaded_metadata_write')
            max_workers = DataLoaderConfig.get_solo().max_workers

            logger.info(
                'Command is{negation} using threads to write data.'.format(negation='' if is_threadsafe else ' not')
            )

            # Stages run in order; loaders listed within a stage may run
            # concurrently (each tuple entry: loader class, API URL, workers).
            pipeline = (
                (
                    (CoursesApiDataLoader, partner.courses_api_url, max_workers),
                ),
                (
                    (EcommerceApiDataLoader, partner.ecommerce_api_url, 1),
                    (ProgramsApiDataLoader, partner.programs_api_url, max_workers),
                ),
                (
                    (AnalyticsAPIDataLoader, partner.analytics_url, 1),
                ),
            )

            if waffle.switch_is_active('parallel_refresh_pipeline'):
                futures = []
                for stage in pipeline:
                    # The `with` block exits (and thus waits for the pool to
                    # drain) before the next stage starts, so stages remain
                    # sequential while loaders within a stage run in parallel.
                    with concurrent.futures.ProcessPoolExecutor() as executor:
                        for loader_class, api_url, max_workers in stage:
                            if api_url:
                                logger.info('Executing Loader [{}]'.format(api_url))
                                futures.append(executor.submit(
                                    execute_parallel_loader,
                                    loader_class,
                                    partner,
                                    api_url,
                                    max_workers,
                                    is_threadsafe,
                                ))

                # Results are collected once all stages finish; any False
                # result marks the overall run as failed.
                success = success and all(f.result() for f in futures)
            else:
                # Flatten pipeline and run serially.
                for loader_class, api_url, max_workers in itertools.chain(*(stage for stage in pipeline)):
                    if api_url:
                        logger.info('Executing Loader [{}]'.format(api_url))
                        # `and success` keeps running remaining loaders even
                        # after a failure, while preserving the failed state.
                        success = execute_loader(
                            loader_class,
                            partner,
                            api_url,
                            max_workers,
                            is_threadsafe,
                        ) and success

            # TODO Cleanup CourseRun overrides equivalent to the Course values.

        connection.connect()  # reconnect to django outside of loop (see connect comment above)

        # Clean up any media orphans that we might have created
        delete_orphans(Image)
        delete_orphans(Video)

        # Loading is complete, so now invalidate the API response cache
        # (deferred at the top of this method by disconnecting the signals).
        set_api_timestamp()

        if not success:
            raise CommandError('One or more of the data loaders above failed.')
Example #5
0
 def delete_orphans(cls):
     """ Remove orphaned objects from the database. """
     # NOTE(review): the `cls` parameter is never used, and the call below
     # targets a name identical to this function's own. Unless an imported
     # `delete_orphans` utility is intended here (and is shadowed by this
     # definition), this recurses infinitely — confirm against the original
     # module's imports.
     for model in (Image, Person, Video):
         delete_orphans(model)