Example #1
0
    def test_main_delays(self, monkeypatch, event_loop, credentials, settings):
        """``parity.main`` must call ``_parity_create_files.delay`` once with its own arguments."""
        # Swap the real task for a mock so only the hand-off is observed.
        fake_task = mock.Mock()
        monkeypatch.setattr(parity, '_parity_create_files', fake_task)

        # Drive the object returned by ``parity.main`` to completion on the test loop.
        event_loop.run_until_complete(
            parity.main('The Best', credentials, settings)
        )

        fake_task.delay.assert_called_once_with('The Best', credentials, settings)
    def test_main_delays(self, monkeypatch, event_loop, credentials, settings):
        """Check that ``parity.main`` forwards its arguments to ``_parity_create_files.delay``."""
        create_files_stub = mock.Mock()
        monkeypatch.setattr(parity, '_parity_create_files', create_files_stub)

        pending = parity.main('The Best', credentials, settings)
        event_loop.run_until_complete(pending)

        # Exactly one call, with the name, credentials and settings passed through untouched.
        expected_args = ('The Best', credentials, settings)
        create_files_stub.delay.assert_called_once_with(*expected_args)
Example #3
0
    async def upload(self, stream, path, **kwargs):
        """Store a new file in osfstorage and return its metadata.

        WB goes through several stages to make sure the file arrives reliably:

        1. Hash calculators (md5, sha1, sha256) are attached to the stream so the digests are
           computed while the data streams through.
        2. The stream is teed into a uniquely named file in the "pending" directory on both
           local disk and the remote storage provider.  If that fails, the local pending copy
           is removed again and ``UploadFailedError`` is raised.
        3. The file's final location is another directory (by default called 'complete'),
           where it is named after its sha256 hash.  If the remote provider already has a file
           at that path, the move is skipped (it has already been uploaded) and the remote
           pending file is deleted instead; otherwise the remote pending file is moved to its
           final path.
        4. The local copy is moved from the pending directory to the complete directory.
        5. The file metadata is sent back to the metadata provider to be recorded.
        6. Two futures are scheduled to archive the locally complete file: one copies it into
           Amazon Glacier, the other calculates a parity archive so the file can be
           reconstructed if on-disk corruption happens.  These are scheduled via celery and do
           not need to complete for the request to finish.
        7. WB builds its metadata response and returns it to the original request issuer.

        The archivers do not signal when they have finished, so for the time being the local
        complete files are allowed to accumulate and must be deleted by some external process.
        COS currently uses a cron job to delete files older than X days.  Under heavy use the
        files may be deleted before the archivers get to run; a script in the osf.io repository
        can audit the files on the remote storage and initiate any missing archives.

        :param stream: Source stream; must support ``add_writer`` and expose ``writers``.
        :param path: Destination path object; ``name``, ``parent.identifier`` and ``_parts``
            are read from it, and the last part's ``_id`` is updated in place.
        :param kwargs: Forwarded unchanged to ``provider.upload``.
        :returns: ``(OsfStorageFileMetadata, created)`` where ``created`` is ``True`` when the
            metadata provider answered with status 201.
        :raises exceptions.UploadFailedError: If writing the pending copies fails.
        :raises exceptions.MetadataError: If checking the final remote path fails with any
            code other than 404.
        """
        self._create_paths()

        pending_id = str(uuid.uuid4())
        provider = self.make_provider(self.settings)
        local_pending = os.path.join(settings.FILE_PATH_PENDING, pending_id)
        remote_pending = await provider.validate_path('/' + pending_id)
        logger.debug('upload: local_pending_path::{}'.format(local_pending))
        logger.debug('upload: remote_pending_path::{}'.format(remote_pending))

        # One digest calculator per algorithm, attached in the order they are reported below.
        for algorithm, factory in (('md5', hashlib.md5),
                                   ('sha1', hashlib.sha1),
                                   ('sha256', hashlib.sha256)):
            stream.add_writer(algorithm, streams.HashStreamWriter(factory))

        try:
            with open(local_pending, 'wb') as local_copy:
                stream.add_writer('file', local_copy)
                await provider.upload(
                    stream,
                    remote_pending,
                    check_created=False,
                    fetch_metadata=False,
                    **kwargs
                )
        except Exception as upload_error:
            # A failed remote upload makes the local pending copy useless: the user has to
            # upload again anyway, and removing it keeps unused pending files from filling up
            # the local disk.
            try:
                os.remove(local_pending)
            except OSError as cleanup_error:
                raise exceptions.UploadFailedError(
                    'Upload failed, please try again.') from cleanup_error
            raise exceptions.UploadFailedError(
                'Upload failed, please try again.') from upload_error

        # The finished file is addressed by its sha256 digest.
        sha256_hex = stream.writers['sha256'].hexdigest
        local_complete = os.path.join(settings.FILE_PATH_COMPLETE, sha256_hex)
        remote_complete = await provider.validate_path('/' + sha256_hex)

        try:
            metadata = await provider.metadata(remote_complete)
        except exceptions.MetadataError as lookup_error:
            if lookup_error.code != 404:
                raise
            # Nothing stored under this digest yet: promote the pending upload.
            metadata, _ = await provider.move(provider, remote_pending, remote_complete)
        else:
            # A file with this digest already exists remotely; drop the pending copy.
            await provider.delete(remote_pending)

        metadata = metadata.serialized()

        # Due to cross volume movement in unix we leverage shutil.move which properly handles this case.
        # http://bytes.com/topic/python/answers/41652-errno-18-invalid-cross-device-link-using-os-rename#post157964
        shutil.move(local_pending, local_complete)

        payload = {
            'name': path.name,
            'user': self.auth['id'],
            'settings': self.settings['storage'],
            'metadata': metadata,
            'hashes': {
                'md5': stream.writers['md5'].hexdigest,
                'sha1': stream.writers['sha1'].hexdigest,
                'sha256': stream.writers['sha256'].hexdigest,
            },
            'worker': {
                'host': os.uname()[1],
                # TODO: Include additional information
                'address': None,
                'version': self.__version__,
            },
        }

        async with self.signed_request(
            'POST',
            self.build_url(path.parent.identifier, 'children'),
            expects=(200, 201),
            data=json.dumps(payload),
            headers={'Content-Type': 'application/json'},
        ) as response:
            created = response.status == 201
            data = await response.json()

        # ``archive`` is removed from the response; a missing key counts as "do archive".
        if settings.RUN_TASKS and data.pop('archive', True):
            parity.main(
                local_complete,
                data['version'],
                self.build_url('hooks', 'metadata') + '/',
                self.parity_credentials,
                self.parity_settings,
            )
            backup.main(
                local_complete,
                data['version'],
                self.build_url('hooks', 'metadata') + '/',
                self.archive_credentials,
                self.archive_settings,
            )

        file_name = path.name
        record = data['data']
        metadata.update({
            'name': file_name,
            'md5': record['md5'],
            'path': record['path'],
            'sha256': record['sha256'],
            'version': record['version'],
            'downloads': record['downloads'],
            'checkout': record['checkout'],
            'modified': record['modified'],
            'modified_utc': utils.normalize_datetime(record['modified']),
        })

        # Point the last path part at the identifier reported by the metadata provider.
        path._parts[-1]._id = metadata['path'].strip('/')
        return OsfStorageFileMetadata(metadata, str(path)), created
Example #4
0
    def upload(self, stream, path, **kwargs):
        """Upload ``stream`` to osfstorage and record the new file version.

        This is a generator function that delegates with ``yield from``.  The stream is hashed
        (md5, sha1, sha256) while it is written to a uniquely named pending file, both locally
        and on the remote provider.  The sha256 digest then becomes the file's final name: if
        the remote provider already has a file under that name the remote pending copy is
        deleted, otherwise it is moved into place.  The local pending file is moved to the
        complete directory, the metadata and hashes are POSTed to the parent's ``children``
        URL and, when enabled, the parity and backup tasks are started.

        :param stream: Source stream; must support ``add_writer`` and expose ``writers``.
        :param path: Destination path object; ``name``, ``parent.identifier`` and ``_parts``
            are read from it, and the last part's ``_id`` is updated in place.
        :param kwargs: Forwarded unchanged to ``provider.upload``.
        :returns: ``(OsfStorageFileMetadata, created)`` where ``created`` is ``True`` when the
            POST was answered with status 201.
        :raises exceptions.MetadataError: If looking up the final remote path fails with any
            code other than 404.
        """
        self._create_paths()

        pending_name = str(uuid.uuid4())
        provider = self.make_provider(self.settings)
        local_pending_path = os.path.join(settings.FILE_PATH_PENDING, pending_name)
        remote_pending_path = yield from provider.validate_path('/' + pending_name)

        # Digests are computed while the data streams through.
        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))
        stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))

        with open(local_pending_path, 'wb') as file_pointer:
            stream.add_writer('file', file_pointer)
            yield from provider.upload(stream, remote_pending_path, check_created=False, fetch_metadata=False, **kwargs)

        # The finished file is named after its sha256 digest.
        complete_name = stream.writers['sha256'].hexdigest
        local_complete_path = os.path.join(settings.FILE_PATH_COMPLETE, complete_name)
        remote_complete_path = yield from provider.validate_path('/' + complete_name)

        try:
            metadata = yield from provider.metadata(remote_complete_path)
        except exceptions.MetadataError as e:
            if e.code != 404:
                raise
            # Nothing stored under this digest yet: promote the pending upload.
            metadata, _ = yield from provider.move(provider, remote_pending_path, remote_complete_path)
        else:
            # A file with this digest already exists remotely; drop the pending copy.
            yield from provider.delete(remote_pending_path)

        # Serialize only once the lookup/move/delete above has succeeded.  Inside a ``finally``
        # clause this line would also run while an error is propagating, possibly with
        # ``metadata`` still unbound, and the resulting UnboundLocalError would replace the
        # real failure.
        metadata = metadata.serialized()

        # Due to cross volume movement in unix we leverage shutil.move which properly handles this case.
        # http://bytes.com/topic/python/answers/41652-errno-18-invalid-cross-device-link-using-os-rename#post157964
        shutil.move(local_pending_path, local_complete_path)

        response = yield from self.make_signed_request(
            'POST',
            self.build_url(path.parent.identifier, 'children'),
            expects=(200, 201),
            data=json.dumps({
                'name': path.name,
                'user': self.auth['id'],
                'settings': self.settings['storage'],
                'metadata': metadata,
                'hashes': {
                    'md5': stream.writers['md5'].hexdigest,
                    'sha1': stream.writers['sha1'].hexdigest,
                    'sha256': stream.writers['sha256'].hexdigest,
                },
                'worker': {
                    'host': os.uname()[1],
                    # TODO: Include additional information
                    'address': None,
                    'version': self.__version__,
                },
            }),
            headers={'Content-Type': 'application/json'},
        )

        created = response.status == 201
        data = yield from response.json()

        # ``archive`` is removed from the response; a missing key counts as "do archive".
        if settings.RUN_TASKS and data.pop('archive', True):
            parity.main(
                local_complete_path,
                self.parity_credentials,
                self.parity_settings,
            )
            backup.main(
                local_complete_path,
                data['version'],
                self.build_url('hooks', 'metadata') + '/',
                self.archive_credentials,
                self.archive_settings,
            )

        name = path.name

        metadata.update({
            'name': name,
            'md5': data['data']['md5'],
            'path': data['data']['path'],
            'sha256': data['data']['sha256'],
            'version': data['data']['version'],
            'downloads': data['data']['downloads'],
            'checkout': data['data']['checkout'],
        })

        # Point the last path part at the identifier reported in the response.
        path._parts[-1]._id = metadata['path'].strip('/')
        return OsfStorageFileMetadata(metadata, str(path)), created
Example #5
0
    def upload(self, stream, path, **kwargs):
        """Upload ``stream`` to osfstorage and record the new file version.

        This is a generator function that delegates with ``yield from``.  The stream is hashed
        (md5, sha1, sha256) while it is written to a uniquely named pending file, both locally
        and on the remote provider.  The sha256 digest then becomes the file's final name: if
        the remote provider already has a file under that name the remote pending copy is
        deleted, otherwise it is moved into place.  The local pending file is moved to the
        complete directory, the metadata and hashes are POSTed to the parent's ``children``
        URL and, when enabled, the parity and backup tasks are started.

        :param stream: Source stream; must support ``add_writer`` and expose ``writers``.
        :param path: Destination path object; ``name`` and ``parent.identifier`` are read
            from it.
        :param kwargs: Forwarded unchanged to ``provider.upload``.
        :returns: ``(OsfStorageFileMetadata, created)`` where ``created`` is ``True`` when the
            POST was answered with status 201.
        :raises exceptions.MetadataError: If looking up the final remote path fails with any
            code other than 404.
        """
        self._create_paths()

        pending_name = str(uuid.uuid4())
        provider = self.make_provider(self.settings)
        local_pending_path = os.path.join(settings.FILE_PATH_PENDING, pending_name)
        remote_pending_path = yield from provider.validate_path('/' + pending_name)

        # Digests are computed while the data streams through.
        stream.add_writer('md5', streams.HashStreamWriter(hashlib.md5))
        stream.add_writer('sha1', streams.HashStreamWriter(hashlib.sha1))
        stream.add_writer('sha256', streams.HashStreamWriter(hashlib.sha256))

        with open(local_pending_path, 'wb') as file_pointer:
            stream.add_writer('file', file_pointer)
            yield from provider.upload(stream, remote_pending_path, check_created=False, fetch_metadata=False, **kwargs)

        # The finished file is named after its sha256 digest.
        complete_name = stream.writers['sha256'].hexdigest
        local_complete_path = os.path.join(settings.FILE_PATH_COMPLETE, complete_name)
        remote_complete_path = yield from provider.validate_path('/' + complete_name)

        try:
            metadata = yield from provider.metadata(remote_complete_path)
        except exceptions.MetadataError as e:
            if e.code != 404:
                raise
            # Nothing stored under this digest yet: promote the pending upload.
            metadata, _ = yield from provider.move(provider, remote_pending_path, remote_complete_path)
        else:
            # A file with this digest already exists remotely; drop the pending copy.
            yield from provider.delete(remote_pending_path)

        # Serialize only once the lookup/move/delete above has succeeded.  Inside a ``finally``
        # clause this line would also run while an error is propagating, possibly with
        # ``metadata`` still unbound, and the resulting UnboundLocalError would replace the
        # real failure.
        metadata = metadata.serialized()

        # Due to cross volume movement in unix we leverage shutil.move which properly handles this case.
        # http://bytes.com/topic/python/answers/41652-errno-18-invalid-cross-device-link-using-os-rename#post157964
        shutil.move(local_pending_path, local_complete_path)

        response = yield from self.make_signed_request(
            'POST',
            self.build_url(path.parent.identifier, 'children'),
            expects=(200, 201),
            data=json.dumps({
                'name': path.name,
                'user': self.auth['id'],
                'settings': self.settings['storage'],
                'metadata': metadata,
                'hashes': {
                    'md5': stream.writers['md5'].hexdigest,
                    'sha1': stream.writers['sha1'].hexdigest,
                    'sha256': stream.writers['sha256'].hexdigest,
                },
                'worker': {
                    'host': os.uname()[1],
                    # TODO: Include additional information
                    'address': None,
                    'version': self.__version__,
                },
            }),
            headers={'Content-Type': 'application/json'},
        )

        created = response.status == 201
        data = yield from response.json()

        # ``archive`` is removed from the response; a missing key counts as "do archive".
        if settings.RUN_TASKS and data.pop('archive', True):
            parity.main(
                local_complete_path,
                self.parity_credentials,
                self.parity_settings,
            )
            backup.main(
                local_complete_path,
                data['version'],
                self.build_url('hooks', 'metadata') + '/',
                self.archive_credentials,
                self.archive_settings,
            )

        name = path.name

        metadata.update({
            'name': name,
            'path': data['data']['path'],
            'version': data['data']['version'],
            'downloads': data['data']['downloads']
        })

        return OsfStorageFileMetadata(metadata, str(path)), created
Example #6
0
    def upload(self, stream, path, **kwargs):
        """Upload ``stream`` to osfstorage and record the new file version.

        This is a generator function that delegates with ``yield from``.  The stream is hashed
        (md5, sha1, sha256) while it is written to a uniquely named pending file, both locally
        and on the remote provider.  The sha256 digest then becomes the file's final name: if
        the remote provider already has a file under that name the remote pending copy is
        deleted, otherwise it is moved into place.  The local pending file is moved to the
        complete directory, the metadata and hashes are POSTed to the parent's ``children``
        URL and, when enabled, the parity and backup tasks are started.

        :param stream: Source stream; must support ``add_writer`` and expose ``writers``.
        :param path: Destination path object; ``name``, ``parent.identifier`` and ``_parts``
            are read from it, and the last part's ``_id`` is updated in place.
        :param kwargs: Forwarded unchanged to ``provider.upload``.
        :returns: ``(OsfStorageFileMetadata, created)`` where ``created`` is ``True`` when the
            POST was answered with status 201.
        :raises exceptions.MetadataError: If looking up the final remote path fails with any
            code other than 404.
        """
        self._create_paths()

        pending_name = str(uuid.uuid4())
        provider = self.make_provider(self.settings)
        local_pending_path = os.path.join(settings.FILE_PATH_PENDING, pending_name)
        remote_pending_path = yield from provider.validate_path("/" + pending_name)

        # Digests are computed while the data streams through.
        stream.add_writer("md5", streams.HashStreamWriter(hashlib.md5))
        stream.add_writer("sha1", streams.HashStreamWriter(hashlib.sha1))
        stream.add_writer("sha256", streams.HashStreamWriter(hashlib.sha256))

        with open(local_pending_path, "wb") as file_pointer:
            stream.add_writer("file", file_pointer)
            yield from provider.upload(stream, remote_pending_path, check_created=False, fetch_metadata=False, **kwargs)

        # The finished file is named after its sha256 digest.
        complete_name = stream.writers["sha256"].hexdigest
        local_complete_path = os.path.join(settings.FILE_PATH_COMPLETE, complete_name)
        remote_complete_path = yield from provider.validate_path("/" + complete_name)

        try:
            metadata = yield from provider.metadata(remote_complete_path)
        except exceptions.MetadataError as e:
            if e.code != 404:
                raise
            # Nothing stored under this digest yet: promote the pending upload.
            metadata, _ = yield from provider.move(provider, remote_pending_path, remote_complete_path)
        else:
            # A file with this digest already exists remotely; drop the pending copy.
            yield from provider.delete(remote_pending_path)

        # Serialize only once the lookup/move/delete above has succeeded.  Inside a ``finally``
        # clause this line would also run while an error is propagating, possibly with
        # ``metadata`` still unbound, and the resulting UnboundLocalError would replace the
        # real failure.
        metadata = metadata.serialized()

        # Due to cross volume movement in unix we leverage shutil.move which properly handles this case.
        # http://bytes.com/topic/python/answers/41652-errno-18-invalid-cross-device-link-using-os-rename#post157964
        shutil.move(local_pending_path, local_complete_path)

        response = yield from self.make_signed_request(
            "POST",
            self.build_url(path.parent.identifier, "children"),
            expects=(200, 201),
            data=json.dumps(
                {
                    "name": path.name,
                    "user": self.auth["id"],
                    "settings": self.settings["storage"],
                    "metadata": metadata,
                    "hashes": {
                        "md5": stream.writers["md5"].hexdigest,
                        "sha1": stream.writers["sha1"].hexdigest,
                        "sha256": stream.writers["sha256"].hexdigest,
                    },
                    "worker": {
                        "host": os.uname()[1],
                        # TODO: Include additional information
                        "address": None,
                        "version": self.__version__,
                    },
                }
            ),
            headers={"Content-Type": "application/json"},
        )

        created = response.status == 201
        data = yield from response.json()

        # ``archive`` is removed from the response; a missing key counts as "do archive".
        if settings.RUN_TASKS and data.pop("archive", True):
            parity.main(local_complete_path, self.parity_credentials, self.parity_settings)
            backup.main(
                local_complete_path,
                data["version"],
                self.build_url("hooks", "metadata") + "/",
                self.archive_credentials,
                self.archive_settings,
            )

        name = path.name

        metadata.update(
            {
                "name": name,
                "md5": data["data"]["md5"],
                "path": data["data"]["path"],
                "sha256": data["data"]["sha256"],
                "version": data["data"]["version"],
                "downloads": data["data"]["downloads"],
                "checkout": data["data"]["checkout"],
            }
        )

        # Point the last path part at the identifier reported in the response.
        path._parts[-1]._id = metadata["path"].strip("/")
        return OsfStorageFileMetadata(metadata, str(path)), created
Example #7
0
    def upload(self, stream, path, **kwargs):
        """Upload ``stream`` to osfstorage and report the result to ``self.callback_url``.

        This is a generator function that delegates with ``yield from``.  The stream is hashed
        (md5, sha1, sha256) while it is written to a uniquely named pending file, both locally
        and on the remote provider.  The sha256 digest becomes the final name: when the
        provider can return metadata for that name the remote pending copy is deleted,
        otherwise (on ``ProviderError``) the pending copy is moved there.  The local pending
        file is then moved to the complete directory, the hashes and metadata are POSTed to
        the callback URL and, if ``settings.RUN_TASKS`` is set, the parity and backup tasks are
        started.

        :param stream: Source stream; must support ``add_writer`` and expose ``writers``.
        :param path: Destination path; sent to the callback as-is, and its final component
            becomes the ``name`` in the returned metadata.
        :param kwargs: Forwarded unchanged to ``provider.upload``.
        :returns: ``(serialized metadata, created)`` where ``created`` is ``True`` when the
            callback answered with status 201.
        """
        self._create_paths()

        pending_id = str(uuid.uuid4())
        local_pending = os.path.join(settings.FILE_PATH_PENDING, pending_id)
        remote_pending = OSFPath('/' + pending_id).path

        # One digest calculator per algorithm, attached in the order they are reported below.
        for algorithm, factory in (('md5', hashlib.md5),
                                   ('sha1', hashlib.sha1),
                                   ('sha256', hashlib.sha256)):
            stream.add_writer(algorithm, streams.HashStreamWriter(factory))

        with open(local_pending, 'wb') as local_copy:
            stream.add_writer('file', local_copy)
            provider = self.make_provider(self.settings)
            yield from provider.upload(
                stream,
                remote_pending,
                check_created=False,
                fetch_metadata=False,
                **kwargs
            )

        # The finished file is named after its sha256 digest.
        digest = stream.writers['sha256'].hexdigest
        local_complete = os.path.join(settings.FILE_PATH_COMPLETE, digest)
        remote_complete = OSFPath('/' + digest).path

        try:
            metadata = yield from provider.metadata(remote_complete)
        except exceptions.ProviderError:
            # Metadata lookup failed: move the pending upload to its final name.
            source = {'path': remote_pending}
            destination = {'path': remote_complete}
            metadata = yield from provider.move(provider, source, destination)
        else:
            # The final name already resolves remotely; drop the pending copy.
            yield from provider.delete(remote_pending)

        # Due to cross volume movement in unix we leverage shutil.move which properly handles this case.
        # http://bytes.com/topic/python/answers/41652-errno-18-invalid-cross-device-link-using-os-rename#post157964
        shutil.move(local_pending, local_complete)

        payload = {
            'auth': self.auth,
            'settings': self.settings['storage'],
            'metadata': metadata,
            'hashes': {
                'md5': stream.writers['md5'].hexdigest,
                'sha1': stream.writers['sha1'].hexdigest,
                'sha256': stream.writers['sha256'].hexdigest,
            },
            'worker': {
                'host': os.uname()[1],
                # TODO: Include additional information
                'address': None,
                'version': self.__version__,
            },
            'path': path,
        }

        response = yield from self.make_signed_request(
            'POST',
            self.callback_url,
            expects=(200, 201),
            data=json.dumps(payload),
            headers={'Content-Type': 'application/json'},
        )

        created = response.status == 201
        data = yield from response.json()

        if settings.RUN_TASKS:
            # Read the version first so a missing key fails before any task is started.
            version_id = data['version']
            parity.main(local_complete, self.parity_credentials, self.parity_settings)
            backup.main(
                local_complete,
                version_id,
                self.callback_url,
                self.archive_credentials,
                self.archive_settings,
            )

        extra = {
            'name': os.path.basename(path),
            'path': data['path'],
            'version': data['version'],
            'downloads': data['downloads']
        }
        metadata.update(extra)

        return OsfStorageFileMetadata(metadata).serialized(), created