Example #1
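Every ContentNode and LocalFile is marked unavailable and the peer's channel stats are stubbed to an empty dict, so get_import_export_data should select no files to transfer from the peer.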
 def test_no_nodes_present_peer(self, channel_stats_mock):
     ContentNode.objects.update(available=False)
     LocalFile.objects.update(available=False)
     stats = {}
     channel_stats_mock.return_value = stats
     _, files_to_transfer, _ = get_import_export_data(
         self.the_channel_id, [], [], False, renderable_only=False, peer_id="1"
     )
     self.assertEqual(len(files_to_transfer), 0)
Example #2
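The stubbed peer stats report a single node (titled "c2c1") as present, so the transfer list should contain exactly that node's files.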
 def test_one_node_present_peer(self, channel_stats_mock):
     ContentNode.objects.update(available=False)
     LocalFile.objects.update(available=False)
     obj = ContentNode.objects.get(title="c2c1")
     stats = {obj.id: {}}
     channel_stats_mock.return_value = stats
     _, files_to_transfer, _ = get_import_export_data(
         self.the_channel_id, [], [], False, renderable_only=False, peer_id="1"
     )
     self.assertEqual(len(files_to_transfer), obj.files.count())
Example #3
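The stubbed peer stats list every node in the channel, so every locally unavailable LocalFile should be queued for transfer.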
 def test_all_nodes_present_peer(self, channel_stats_mock):
     ContentNode.objects.update(available=False)
     LocalFile.objects.update(available=False)
     stats = {
         key: {} for key in ContentNode.objects.all().values_list("id", flat=True)
     }
     channel_stats_mock.return_value = stats
     _, files_to_transfer, _ = get_import_export_data(
         self.the_channel_id, [], [], False, renderable_only=False, peer_id="1"
     )
     self.assertEqual(
         len(files_to_transfer), LocalFile.objects.filter(available=False).count()
     )
Example #4
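An API view: it requires a channel_id, rejects requests that set more than one of drive_id, peer_id, and export, and calls get_import_export_data to report the resource count and total file size for the requested import or export.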
    def post(self, request):
        try:
            channel_id = self.request.data["channel_id"]
        except KeyError:
            raise ValidationError(
                "channel_id is required for calculating file size and resource counts"
            )
        drive_id = self.request.data.get("drive_id")
        peer_id = self.request.data.get("peer_id")
        for_export = self.request.data.get("export")
        node_ids = self.request.data.get("node_ids")
        exclude_node_ids = self.request.data.get("exclude_node_ids")
        flag_count = sum(
            int(bool(flag)) for flag in (drive_id, peer_id, for_export))
        if flag_count > 1:
            raise ValidationError(
                "Must specify at most one of drive_id, peer_id, and export")
        # By default filter to unavailable files
        available = False
        if for_export:
            available = True
        try:
            (
                total_resource_count,
                files_to_download,
                total_bytes_to_transfer,
            ) = get_import_export_data(
                channel_id,
                node_ids,
                exclude_node_ids,
                available,
                drive_id=drive_id,
                peer_id=peer_id,
            )
        except LocationError:
            if drive_id:
                raise ValidationError(
                    "The external drive with given drive id {} does not exist."
                    .format(drive_id))
            if peer_id:
                raise ValidationError(
                    "The network location with the id {} does not exist".
                    format(peer_id))

        return Response({
            "resource_count": total_resource_count,
            "file_size": total_bytes_to_transfer,
        })
Example #5
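A deletion helper: selective deletions make the chosen nodes invisible, force_delete uses get_import_export_data (with no availability filter) to find the channel's local files so they can be removed, and the whole content tree is deleted when nothing in the channel remains available.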
def delete_metadata(channel, node_ids, exclude_node_ids, force_delete):
    # Only delete all metadata if we are not doing selective deletion
    delete_all_metadata = not (node_ids or exclude_node_ids)

    if node_ids or exclude_node_ids:
        # If we have been passed node ids do not do a full deletion pass
        set_content_invisible(channel.id, node_ids, exclude_node_ids)
        # If everything has been made invisible, delete all the metadata
        delete_all_metadata = not channel.root.available

    if force_delete:
        # Do this before we delete all the metadata, as otherwise we lose
        # track of which local files were associated with the channel we
        # just deleted.
        _, unused_files, _ = get_import_export_data(
            channel.id,
            node_ids,
            exclude_node_ids,
            # Don't filter by availability as we have set nodes invisible
            # above, but the localfiles we are trying to delete are still
            # available
            None,
            renderable_only=False,
            topic_thumbnails=False,
        )

        with db_lock():
            propagate_forced_localfile_removal(unused_files)
        # Separate these operations as running the SQLAlchemy code in the latter
        # seems to cause the Django ORM interactions in the former to roll back
        # Not quite sure what is causing it, but presumably due to transaction
        # scopes.
        reannotate_all_channels()

    if delete_all_metadata:
        logger.info("Deleting all channel metadata")
        with db_lock():
            channel.delete_content_tree_and_files()

    # Clear any previously set channel availability stats for this channel
    clear_channel_stats(channel.id)

    return delete_all_metadata
Example #6
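Two sibling video nodes share a single LocalFile; one node is included and the other excluded, and the test asserts the shared file still appears exactly once in the transfer list.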
 def test_no_exclude_duplicate_files(self):
     """
     Test that including a node id in exclude_node_ids does not
     exclude a shared file that is also used in an included node
     """
     root_node = ContentNode.objects.get(parent__isnull=True)
     node = ContentNode.objects.filter(
         parent=root_node, kind=content_kinds.TOPIC
     ).first()
     node1 = ContentNode.objects.create(
         title="test1",
         id=uuid.uuid4().hex,
         content_id=uuid.uuid4().hex,
         channel_id=root_node.channel_id,
         parent=node,
         kind=content_kinds.VIDEO,
         available=False,
     )
     node2 = ContentNode.objects.create(
         title="test2",
         id=uuid.uuid4().hex,
         content_id=uuid.uuid4().hex,
         channel_id=root_node.channel_id,
         parent=node,
         kind=content_kinds.VIDEO,
         available=False,
     )
     local_file = LocalFile.objects.create(
         id=uuid.uuid4().hex, extension="mp4", available=False, file_size=10
     )
     File.objects.create(
         id=uuid.uuid4().hex, local_file=local_file, contentnode=node1
     )
     File.objects.create(
         id=uuid.uuid4().hex, local_file=local_file, contentnode=node2
     )
     _, files_to_transfer, _ = get_import_export_data(
         root_node.channel_id, [node1.id], [node2.id], False, renderable_only=False
     )
     self.assertEqual(
         len(list(filter(lambda x: x.id == local_file.id, files_to_transfer))), 1
     )
Example #7
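An async management command that exports a channel's files to a destination directory, passing True as the availability filter so only locally available files are enumerated, and cleaning up partially exported files if the task is cancelled.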
    def handle_async(self, *args, **options):
        if paths.using_remote_storage():
            raise CommandError(
                "Cannot export files when using remote file storage")
        channel_id = options["channel_id"]
        data_dir = os.path.realpath(options["destination"])
        node_ids = options["node_ids"]
        exclude_node_ids = options["exclude_node_ids"]
        logger.info("Exporting content for channel id {} to {}".format(
            channel_id, data_dir))

        (
            total_resource_count,
            files,
            total_bytes_to_transfer,
        ) = get_import_export_data(channel_id, node_ids, exclude_node_ids,
                                   True)

        self.update_job_metadata(total_bytes_to_transfer, total_resource_count)

        exported_files = []

        with self.start_progress(
                total=total_bytes_to_transfer) as overall_progress_update:

            for f in files:

                if self.is_cancelled():
                    break

                dest = self.export_file(f, data_dir, overall_progress_update)
                if dest:
                    exported_files.append(dest)

            if self.is_cancelled():
                # Cancelled, clean up any already downloading files.
                for dest in exported_files:
                    os.remove(dest)
                self.cancel()
Example #8
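A variant of the delete_metadata helper from Example #5 that wraps each database write phase in db_task_write_lock instead of db_lock().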
def delete_metadata(channel, node_ids, exclude_node_ids, force_delete):
    # Only delete all metadata if we are not doing selective deletion
    delete_all_metadata = not (node_ids or exclude_node_ids)

    if node_ids or exclude_node_ids:
        # If we have been passed node ids do not do a full deletion pass
        with db_task_write_lock:
            set_content_invisible(channel.id, node_ids, exclude_node_ids)
        # If everything has been made invisible, delete all the metadata
        delete_all_metadata = not channel.root.available

    if force_delete:
        # Do this before we delete all the metadata, as otherwise we lose
        # track of which local files were associated with the channel we
        # just deleted.
        _, unused_files, _ = get_import_export_data(
            channel.id,
            node_ids,
            exclude_node_ids,
            # Don't filter by availability as we have set nodes invisible
            # above, but the localfiles we are trying to delete are still
            # available
            None,
            renderable_only=False,
            topic_thumbnails=False,
        )

        with db_task_write_lock:
            propagate_forced_localfile_removal(unused_files)

    if delete_all_metadata:
        logger.info("Deleting all channel metadata")
        with db_task_write_lock:
            channel.delete_content_tree_and_files()

    # Clear any previously set channel availability stats for this channel
    clear_channel_stats(channel.id)

    return delete_all_metadata
Example #9
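The core transfer routine behind imports: it resolves the file list with get_import_export_data (or get_import_data_for_update), skips files already on disk or held in remote storage, downloads or copies the rest in batches through a thread pool, and finally annotates content visibility and records transfer metadata on the job.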
    def _transfer(  # noqa: max-complexity=16
        self,
        method,
        channel_id,
        path=None,
        drive_id=None,
        node_ids=None,
        exclude_node_ids=None,
        baseurl=None,
        peer_id=None,
        renderable_only=True,
        import_updates=False,
    ):
        try:
            if not import_updates:
                (
                    total_resource_count,
                    files_to_download,
                    total_bytes_to_transfer,
                ) = get_import_export_data(
                    channel_id,
                    node_ids,
                    exclude_node_ids,
                    False,
                    renderable_only=renderable_only,
                    drive_id=drive_id,
                    peer_id=peer_id,
                )
            else:
                (
                    total_resource_count,
                    files_to_download,
                    total_bytes_to_transfer,
                ) = get_import_data_for_update(
                    channel_id,
                    renderable_only=renderable_only,
                    drive_id=drive_id,
                    peer_id=peer_id,
                )
        except LocationError:
            if drive_id:
                raise CommandError(
                    "The external drive with given drive id {} does not exist.".format(
                        drive_id
                    )
                )
            if peer_id:
                raise CommandError(
                    "The network location with the id {} does not exist".format(peer_id)
                )
        except ValueError:
            if import_updates:
                raise CommandError(
                    "Tried to perform an channel update import when update data was not available"
                )
            raise

        job = get_current_job()

        if job:
            job.extra_metadata["file_size"] = total_bytes_to_transfer
            job.extra_metadata["total_resources"] = total_resource_count
            job.save_meta()

        number_of_skipped_files = 0
        transferred_file_size = 0
        file_checksums_to_annotate = []
        public = None

        # If we're downloading, check listing status
        if method == DOWNLOAD_METHOD:
            public = lookup_channel_listing_status(
                channel_id=channel_id, baseurl=baseurl
            )

        resources_before_transfer = (
            ContentNode.objects.filter(channel_id=channel_id, available=True)
            .exclude(kind=content_kinds.TOPIC)
            .values("content_id")
            .distinct()
            .count()
        )

        dummy_bytes_for_annotation = annotation.calculate_dummy_progress_for_annotation(
            node_ids, exclude_node_ids, total_bytes_to_transfer
        )

        with self.start_progress(
            total=total_bytes_to_transfer + dummy_bytes_for_annotation
        ) as overall_progress_update:
            if method == DOWNLOAD_METHOD:
                session = requests.Session()

            file_transfers = []
            for f in files_to_download:

                if self.is_cancelled():
                    break

                filename = f.get_filename()
                try:
                    dest = paths.get_content_storage_file_path(filename)
                except InvalidStorageFilenameError:
                    # If the destination file name is malformed, just stop now.
                    overall_progress_update(f.file_size)
                    continue

                # if the file already exists, or we are using remote storage, add its size to our overall progress, and skip
                if paths.using_remote_storage() or (
                    os.path.isfile(dest) and os.path.getsize(dest) == f.file_size
                ):
                    overall_progress_update(f.file_size)
                    file_checksums_to_annotate.append(f.id)
                    transferred_file_size += f.file_size
                    continue

                # determine where we're downloading/copying from, and create appropriate transfer object
                if method == DOWNLOAD_METHOD:
                    url = paths.get_content_storage_remote_url(
                        filename, baseurl=baseurl
                    )
                    filetransfer = transfer.FileDownload(
                        url, dest, session=session, cancel_check=self.is_cancelled
                    )
                    file_transfers.append((f, filetransfer))
                elif method == COPY_METHOD:
                    try:
                        srcpath = paths.get_content_storage_file_path(
                            filename, datafolder=path
                        )
                    except InvalidStorageFilenameError:
                        # If the source file name is malformed, just stop now.
                        overall_progress_update(f.file_size)
                        continue
                    filetransfer = transfer.FileCopy(
                        srcpath, dest, cancel_check=self.is_cancelled
                    )
                    file_transfers.append((f, filetransfer))

            with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
                batch_size = 100
                # ThreadPoolExecutor allows us to download files concurrently,
                # greatly reducing download time in most cases. However, loading
                # all the downloads into the pool requires considerable memory,
                # so we divide the downloads into batches to keep memory usage down.
                # In batches of 100, total RAM usage doesn't exceed 250MB in testing.
                while len(file_transfers) > 0:
                    future_file_transfers = {}
                    for i in range(batch_size):
                        if len(file_transfers) > 0:
                            f, filetransfer = file_transfers.pop()
                            future = executor.submit(
                                self._start_file_transfer, f, filetransfer
                            )
                            future_file_transfers[future] = (f, filetransfer)

                    for future in concurrent.futures.as_completed(
                        future_file_transfers
                    ):
                        f, filetransfer = future_file_transfers[future]
                        try:
                            status, data_transferred = future.result()
                            overall_progress_update(data_transferred)
                            if self.is_cancelled():
                                break

                            if status == FILE_SKIPPED:
                                number_of_skipped_files += 1
                            else:
                                file_checksums_to_annotate.append(f.id)
                                transferred_file_size += f.file_size
                        except transfer.TransferCanceled:
                            break
                        except Exception as e:
                            logger.error(
                                "An error occurred during content import: {}".format(e)
                            )
                            if (
                                isinstance(e, requests.exceptions.HTTPError)
                                and e.response.status_code == 404
                            ) or (isinstance(e, OSError) and e.errno == 2):
                                # Continue file import when the current file is not found from the source and is skipped.
                                overall_progress_update(f.file_size)
                                number_of_skipped_files += 1
                                continue
                            else:
                                self.exception = e
                                break

            annotation.set_content_visibility(
                channel_id,
                file_checksums_to_annotate,
                node_ids=node_ids,
                exclude_node_ids=exclude_node_ids,
                public=public,
            )

            resources_after_transfer = (
                ContentNode.objects.filter(channel_id=channel_id, available=True)
                .exclude(kind=content_kinds.TOPIC)
                .values("content_id")
                .distinct()
                .count()
            )

            if job:
                job.extra_metadata["transferred_file_size"] = transferred_file_size
                job.extra_metadata["transferred_resources"] = (
                    resources_after_transfer - resources_before_transfer
                )
                job.save_meta()

            if number_of_skipped_files > 0:
                logger.warning(
                    "{} files are skipped, because errors occurred during the import.".format(
                        number_of_skipped_files
                    )
                )

            overall_progress_update(dummy_bytes_for_annotation)

            if self.exception:
                raise self.exception

            if self.is_cancelled():
                self.cancel()
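
Taken together, these examples show get_import_export_data being called with four positional arguments (channel_id, node_ids, exclude_node_ids, available) plus optional renderable_only, drive_id, peer_id, and topic_thumbnails keywords, and returning a (resource_count, files, total_bytes) tuple. Below is a minimal sketch of a standalone call, assuming the Kolibri import path kolibri.core.content.utils.import_export_content and a configured Django/Kolibri environment; the channel id is hypothetical.

# Minimal sketch, assuming the standard Kolibri module layout; the channel id
# below is hypothetical and Django/Kolibri are assumed to be already set up.
from kolibri.core.content.utils.import_export_content import get_import_export_data

channel_id = "95a52b386f2c485cb97dd60901674a98"  # hypothetical channel id

# Positional arguments: channel_id, node_ids, exclude_node_ids, available.
# Passing None for the node selections means "consider the whole channel";
# available=False restricts the result to files not yet present locally,
# mirroring the import paths in the examples above.
resource_count, files_to_transfer, total_bytes = get_import_export_data(
    channel_id,
    None,
    None,
    False,
    renderable_only=False,
)

print(
    "{} resources, {} files, {} bytes to transfer".format(
        resource_count, len(files_to_transfer), total_bytes
    )
)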