Example #1
    def move_from_storage_service(self,
                                  source_path,
                                  destination_path,
                                  package=None):
        """ Moves source_path to destination_path in this Swift container. """
        if os.path.isdir(source_path):
            # Both source and destination paths should end with /
            source_path = os.path.join(source_path, '')
            destination_path = os.path.join(destination_path, '')
            # Swift does not accept folders, so upload each file individually
            for path, dirs, files in os.walk(source_path):
                for basename in files:
                    entry = os.path.join(path, basename)
                    dest = entry.replace(source_path, destination_path, 1)
                    checksum = utils.generate_checksum(entry)
                    with open(entry, 'rb') as f:
                        self.connection.put_object(
                            self.container,
                            obj=dest,
                            contents=f,
                            etag=checksum.hexdigest(),
                            content_length=os.path.getsize(entry))
        elif os.path.isfile(source_path):
            checksum = utils.generate_checksum(source_path)
            with open(source_path, 'rb') as f:
                self.connection.put_object(
                    self.container,
                    obj=destination_path,
                    contents=f,
                    etag=checksum.hexdigest(),
                    content_length=os.path.getsize(source_path),
                )
        else:
            raise StorageException(
                _('%(path)s is neither a file nor a directory, may not exist')
                % {'path': source_path})
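Every example on this page leans on utils.generate_checksum, which is not shown here. A minimal sketch of what the callers appear to assume, i.e. a hashlib object (with .name and .hexdigest()), an algorithm argument defaulting to md5, and a ValueError for unknown algorithm names; the real helper may differ:

import hashlib

def generate_checksum(file_path, checksum_type='md5'):
    # hashlib.new() raises ValueError for unknown algorithms, which matches
    # the "except ValueError" fallbacks in the LOCKSS examples below.
    checksum = hashlib.new(checksum_type)
    with open(file_path, 'rb') as f:
        # Read in blocks so large packages do not have to fit in memory.
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            checksum.update(chunk)
    return checksum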
Example #2
    def post_move_from_storage_service(self, staging_path, destination_path,
                                       package):
        """ POST to Arkivum with information about the newly stored Package. """
        if package is None:
            return

        relative_path = os.path.relpath(destination_path, self.space.path)
        if package.is_compressed:  # Single-file package
            url = 'https://' + self.host + '/api/2/files/release/' + relative_path
            headers = {'Content-Type': 'application/json'}

            # Get size, checksum, checksum algorithm (md5sum), compression algorithm
            checksum = utils.generate_checksum(staging_path, 'md5')
            payload = {
                'size': str(os.path.getsize(staging_path)),
                'checksum': checksum.hexdigest(),
                'checksumAlgorithm': 'md5',
                'compressionAlgorithm': os.path.splitext(package.current_path)[1],
            }
            payload = json.dumps(payload)
            files = None
        else:  # uncompressed bag
            url = 'https://' + self.host + '/api/3/ingest-manifest/release'
            headers = None
            # FIXME Destination path has to exclude mount path, but what is part of the mount path? Let's pretend it's the Space path
            payload = {'bagitPath': os.path.join('/', relative_path)}
            files = {'': ('', '')}

        LOGGER.debug('POST URL: %s; Header: %s; Payload: %s; Files: %s', url,
                     headers, payload, files)
        try:
            response = requests.post(url,
                                     headers=headers,
                                     data=payload,
                                     files=files,
                                     verify=VERIFY)
        except requests.exceptions.ConnectionError:
            LOGGER.exception('Error in connection for POST to %s', url)
            raise StorageException('Error in connection for POST to %s' % url)

        LOGGER.debug('Response: %s, Response text: %s', response.status_code,
                     response.text)
        if response.status_code not in (requests.codes.ok,
                                        requests.codes.accepted):
            LOGGER.warning('Arkivum responded with %s: %s',
                           response.status_code, response.text)
            raise StorageException('Unable to notify Arkivum of %s' % package)
        # Response has request ID for polling status
        try:
            response_json = response.json()
        except ValueError:  # covers json.JSONDecodeError
            raise StorageException(
                "Could not get request ID from Arkivum's response %s"
                % response.text)

        # Store request ID in misc_attributes
        request_id = response_json['id']
        package.misc_attributes.update({'arkivum_identifier': request_id})
        package.save()
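One way to exercise this method without a live Arkivum host is to stub out requests.post. A hypothetical test sketch, assuming space and package fixtures with the attributes the method reads (space.path, package.current_path, package.misc_attributes):

import tempfile
from unittest import mock

# The method sizes and checksums the staging file, so it must really exist.
staging = tempfile.NamedTemporaryFile(suffix='.7z', delete=False)
staging.write(b'fake AIP bytes')
staging.close()

fake_response = mock.Mock(status_code=200, text='{"id": "abc-123"}')
fake_response.json.return_value = {'id': 'abc-123'}
with mock.patch('requests.post', return_value=fake_response):
    space.post_move_from_storage_service(staging.name, '/dest/aip.7z', package)
assert package.misc_attributes['arkivum_identifier'] == 'abc-123'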
Example #3
    def _download_file(self, remote_path, download_path):
        """
        Download the file at remote_path in Swift to download_path.

        :param str remote_path: Full path in Swift
        :param str download_path: Full path to save the file to
        :raises: swiftclient.exceptions.ClientException may be raised and is not caught
        """
        # TODO find a way to stream content to dest_path, instead of having to put it in memory
        headers, content = self.connection.get_object(self.container,
                                                      remote_path)
        self.space.create_local_directory(download_path)
        with open(download_path, 'wb') as f:
            f.write(content)
        # Check ETag matches checksum of this file
        if 'etag' in headers:
            checksum = utils.generate_checksum(download_path)
            if checksum.hexdigest() != headers['etag']:
                message = _(
                    'ETag %(etag)s for %(remote_path)s does not match checksum %(checksum)s'
                ) % {
                    'remote_path': remote_path,
                    'etag': headers['etag'],
                    'checksum': checksum.hexdigest()
                }
                LOGGER.warning(message)
                raise StorageException(message)
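The TODO above could likely be addressed with python-swiftclient's resp_chunk_size argument, which makes get_object return the body as an iterator instead of a single string. A sketch of the streaming variant of the method body, under that assumption:

        # Stream the object in 64 KiB pieces rather than loading it whole.
        headers, body = self.connection.get_object(
            self.container, remote_path, resp_chunk_size=64 * 1024)
        self.space.create_local_directory(download_path)
        with open(download_path, 'wb') as f:
            for chunk in body:
                f.write(chunk)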
Example #4
    def post_move_from_storage_service(self, staging_path, destination_path,
                                       package):
        """ POST to Arkivum with information about the newly stored Package. """
        if package is None:
            return

        # Get size, checksum, checksum algorithm (md5sum), compression algorithm
        checksum = utils.generate_checksum(staging_path, 'md5')
        payload = {
            'size': str(os.path.getsize(staging_path)),
            'checksum': checksum.hexdigest(),
            'checksumAlgorithm': 'md5',
            'compressionAlgorithm': os.path.splitext(package.current_path)[1],
        }
        payload = json.dumps(payload)

        # POST to Arkivum host/api/2/files/release/relative_path
        relative_path = os.path.relpath(destination_path, self.space.path)
        url = 'https://' + self.host + '/api/2/files/release/' + relative_path
        LOGGER.info('URL: %s, Payload: %s', url, payload)

        try:
            response = requests.post(
                url,
                headers={'Content-Type': 'application/json'},
                data=payload,
                verify=VERIFY)
        except requests.exceptions.ConnectionError:
            LOGGER.exception('Error in connection for POST to %s', url)
            raise StorageException('Error in connection for POST to %s' % url)

        LOGGER.info('Response: %s, Response text: %s', response.status_code,
                    response.text)
        if response.status_code not in (requests.codes.ok,
                                        requests.codes.accepted):
            LOGGER.warning('Arkivum responded with %s: %s',
                           response.status_code, response.text)
            raise StorageException('Unable to notify Arkivum of %s' % package)
        # Response has request ID for polling status
        try:
            response_json = response.json()
        except ValueError:  # covers json.JSONDecodeError
            raise StorageException(
                "Could not get request ID from Arkivum's response %s"
                % response.text)
        request_id = response_json['id']

        # Store request ID in misc_attributes
        package.misc_attributes.update({'request_id': request_id})
        package.save()
Example #5
def _fetch_content(deposit_uuid, objects, subdirs=None):
    """
    Download a number of files, keeping track of progress and success using a
    database record. After downloading, finalize deposit if requested.

    If subdirs is provided, the file will be moved into a subdirectory of the
    new transfer; otherwise, it will be placed in the transfer's root.
    """
    # add download task to keep track of progress
    deposit = get_deposit(deposit_uuid)
    task = models.PackageDownloadTask(package=deposit)
    task.downloads_attempted = len(objects)
    task.downloads_completed = 0
    task.save()

    # Get deposit protocol info
    deposit_space = deposit.current_location.space.get_child_space()
    fedora_username = getattr(deposit_space, 'fedora_user', None)
    fedora_password = getattr(deposit_space, 'fedora_password', None)

    # download the files
    temp_dir = tempfile.mkdtemp()
    completed = 0
    for item in objects:
        # create download task file record
        task_file = models.PackageDownloadTaskFile(task=task)
        task_file.save()

        try:
            filename = item['filename']

            task_file.filename = filename
            task_file.url = item['url']
            task_file.save()

            download_resource(url=item['url'],
                              destination_path=temp_dir,
                              filename=filename,
                              username=fedora_username,
                              password=fedora_password)

            temp_filename = os.path.join(temp_dir, filename)

            if (item['checksum'] is not None and item['checksum'] !=
                    generate_checksum(temp_filename, 'md5').hexdigest()):
                os.unlink(temp_filename)
                raise Exception("Incorrect checksum")

            # Some MODS records have no proper filenames
            if filename == 'MODS Record':
                filename = item['object_id'].replace(':', '-') + '-MODS.xml'

            if subdirs:
                base_path = os.path.join(deposit.full_path, *subdirs)
            else:
                base_path = deposit.full_path

            new_path = os.path.join(base_path, filename)
            shutil.move(temp_filename, new_path)

            # mark download task file record complete or failed
            task_file.completed = True
            task_file.save()

            LOGGER.info('Saved file to %s', new_path)
            completed += 1

            file_record = models.File(name=item['filename'],
                                      source_id=item['object_id'],
                                      checksum=generate_checksum(
                                          new_path, 'sha512').hexdigest())
            file_record.save()
        except Exception as e:
            LOGGER.exception('Package download task encountered an error: %s', e)
            # an error occurred
            task_file.failed = True
            task_file.save()

    # remove temp dir
    shutil.rmtree(temp_dir)

    # record the number of successful downloads and completion time
    task.downloads_completed = completed
    task.download_completion_time = timezone.now()
    task.save()

    # if the deposit is ready for finalization and this is the last batch
    # download to complete, then finalize
    ready_for_finalization = deposit.misc_attributes.get(
        'ready_for_finalization', False)
    if ready_for_finalization and deposit_downloading_status(
            deposit) == models.PackageDownloadTask.COMPLETE:
        _finalize_if_not_empty(deposit_uuid)
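download_resource is called above but not shown. A plausible sketch given the call site, streaming the URL into destination_path/filename with optional HTTP Basic auth for Fedora; hypothetical, the real helper may differ:

import os
import requests

def download_resource(url, destination_path, filename,
                      username=None, password=None):
    auth = (username, password) if username is not None else None
    response = requests.get(url, auth=auth, stream=True)
    response.raise_for_status()
    # Write in pieces so large files are not held in memory.
    with open(os.path.join(destination_path, filename), 'wb') as f:
        for chunk in response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)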
Example #6
    def _create_resource(self, package, output_files):
        """ Given a package, create an Atom resource entry to send to LOCKSS.

        Parses metadata for the Atom entry from the METS file, uses
        LOCKSS-o-matic-specific tags to describe size and checksums.
        """

        # Parse METS to get information for atom entry
        relative_mets_path = os.path.join(
            os.path.splitext(os.path.basename(package.current_path))[0],
            "data",
            'METS.{}.xml'.format(package.uuid))
        (mets_path, temp_dir) = package.extract_file(relative_mets_path)
        mets = etree.parse(mets_path)
        # Delete temp dir if created
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

        # Parse out name and description if found
        slug = str(package.uuid)
        title = os.path.basename(package.current_path)
        summary = 'AIP generated by Archivematica with uuid {}'.format(package.uuid)
        dublincore = mets.find('mets:dmdSec/mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore', namespaces=utils.NSMAP)
        if dublincore is not None:
            title = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=title)
            slug = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=slug)
            summary = dublincore.findtext('dcterms:description', namespaces=utils.NSMAP, default=summary)
        # Parse out Agent for author
        authors = mets.xpath(".//mets:mdWrap[@MDTYPE='PREMIS:AGENT']//mets:agentType[text()='organization']/ancestor::mets:agent/*/mets:agentIdentifierValue", namespaces=utils.NSMAP)
        author = authors[0].text if authors else None

        # Create atom entry
        entry = sword2.Entry(
            title=title,
            id='urn:uuid:' + package.uuid,
            author={'name': author},
            summary=summary)

        # Add each chunk to the atom entry
        if not self.pointer_root:
            self.pointer_root = etree.parse(package.full_pointer_file_path)
        entry.register_namespace('lom', utils.NSMAP['lom'])
        for index, file_path in enumerate(output_files):
            # Get external URL
            if len(output_files) == 1:
                external_url = self._download_url(package.uuid)
            else:
                external_url = self._download_url(package.uuid, index + 1)

            # Get checksum and size from pointer file (or generate if not found)
            file_e = self.pointer_root.find(".//mets:fileGrp[@USE='LOCKSS chunk']/mets:file[@ID='{}']".format(os.path.basename(file_path)), namespaces=utils.NSMAP)
            if file_e is not None:
                checksum_name = file_e.get('CHECKSUMTYPE')
                checksum_value = file_e.get('CHECKSUM')
                size = int(file_e.get('SIZE'))
            else:
                # Not split, generate
                try:
                    checksum = utils.generate_checksum(file_path,
                        self.checksum_type)
                except ValueError:  # Invalid checksum type
                    checksum = utils.generate_checksum(file_path, 'md5')
                checksum_name = checksum.name.upper().replace('SHA', 'SHA-')
                checksum_value = checksum.hexdigest()
                size = os.path.getsize(file_path)

            # Convert size to kB (float division and int() guard against
            # Python 2 floor division and a '3.0'-style string)
            size = str(int(math.ceil(size / 1000.0)))

            # Add new content entry and values
            entry.add_field('lom_content', external_url)
            content_entry = entry.entry[-1]
            content_entry.set('size', size)
            content_entry.set('checksumType', checksum_name)
            content_entry.set('checksumValue', checksum_value)

        LOGGER.debug('LOCKSS atom entry: %s', entry)
        return entry, slug
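The checksum-name normalization above turns hashlib-style names into the hyphenated form used in the entry. A small illustration, assuming generate_checksum returns a hashlib object and file_path is any existing file:

checksum = utils.generate_checksum(file_path, 'sha256')
checksum_name = checksum.name.upper().replace('SHA', 'SHA-')
assert checksum_name == 'SHA-256'  # 'sha256' -> 'SHA256' -> 'SHA-256'; 'md5' stays 'MD5'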
Example #7
    def _split_package(self, package):
        """
        Splits the package into chunks of size self.au_size. Returns list of paths to the chunks.

        If the package has already been split (and an event is in the pointer
        file), returns the list of file paths from the pointer file.

        Updates the pointer file with the new LOCKSS chunks, and adds 'division'
        event.
        """
        # Parse pointer file
        if not self.pointer_root:
            self.pointer_root = etree.parse(package.full_pointer_file_path)

        # Check if file is already split, and if so just return split files
        if self.pointer_root.xpath('.//premis:eventType[text()="division"]', namespaces=utils.NSMAP):
            chunks = self.pointer_root.findall(".//mets:div[@TYPE='Archival Information Package']/mets:div[@TYPE='LOCKSS chunk']", namespaces=utils.NSMAP)
            output_files = [c.find('mets:fptr', namespaces=utils.NSMAP).get('FILEID') for c in chunks]
            return output_files

        file_path = package.full_path
        expected_num_files = math.ceil(
            os.path.getsize(file_path) / float(self.au_size))
        LOGGER.debug('expected_num_files: %s', expected_num_files)

        # No split needed - just return the file path
        if expected_num_files <= 1:
            LOGGER.debug('Only one file expected, not splitting')
            output_files = [file_path]
            # No events or structMap changes needed
            LOGGER.info('LOCKSS: after splitting: %s', output_files)
            return output_files

        # Split file
        # Strip extension, add .tar-1 ('-1' to make rename script happy)
        output_path = os.path.splitext(file_path)[0] + '.tar-1'
        command = ['tar', '--create', '--multi-volume',
            '--tape-length', str(self.au_size),
            '--new-volume-script', 'common/tar_new_volume.sh',
            '-f', output_path, file_path]
        # TODO reserve space in quota for extra files
        LOGGER.info('LOCKSS split command: %s', command)
        try:
            subprocess.check_call(command)
        except Exception:
            LOGGER.exception("Split of %s failed with command %s", file_path, command)
            raise
        output_path = output_path[:-2]  # Remove '-1'
        dirname, basename = os.path.split(output_path)
        output_files = sorted([os.path.join(dirname, entry) for entry in os.listdir(dirname) if entry.startswith(basename)])

        # Update pointer file
        amdsec = self.pointer_root.find('mets:amdSec', namespaces=utils.NSMAP)

        # Add 'division' PREMIS:EVENT
        try:
            event_detail = subprocess.check_output(['tar', '--version'])
        except subprocess.CalledProcessError as e:
            event_detail = e.output or _('Error: getting tool info; probably GNU tar')
        utils.mets_add_event(
            amdsec,
            event_type='division',
            event_detail=event_detail,
            event_outcome_detail_note='{} LOCKSS chunks created'.format(len(output_files)),
        )

        # Update structMap & fileSec
        self.pointer_root.find('mets:structMap', namespaces=utils.NSMAP).set('TYPE', 'logical')
        aip_div = self.pointer_root.find("mets:structMap/mets:div[@TYPE='Archival Information Package']", namespaces=utils.NSMAP)
        filesec = self.pointer_root.find('mets:fileSec', namespaces=utils.NSMAP)
        filegrp = etree.SubElement(filesec, utils.PREFIX_NS['mets'] + 'fileGrp', USE='LOCKSS chunk')

        # Move ftpr to Local copy div
        local_ftpr = aip_div.find('mets:fptr', namespaces=utils.NSMAP)
        if local_ftpr is not None:
            div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='Local copy')
            div.append(local_ftpr)  # This moves local_fptr

        # Add each split chunk to structMap & fileSec
        for idx, out_path in enumerate(output_files):
            # Add div to structMap
            div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='LOCKSS chunk', ORDER=str(idx + 1))
            etree.SubElement(div, utils.PREFIX_NS['mets'] + 'fptr', FILEID=os.path.basename(out_path))
            # Get checksum and size for fileSec
            try:
                checksum = utils.generate_checksum(out_path, self.checksum_type)
            except ValueError:  # Invalid checksum type
                checksum = utils.generate_checksum(out_path, 'md5')
            checksum_name = checksum.name.upper().replace('SHA', 'SHA-')
            size = os.path.getsize(out_path)
            # Add file & FLocat to fileSec
            file_e = etree.SubElement(filegrp, utils.PREFIX_NS['mets'] + 'file',
                ID=os.path.basename(out_path), SIZE=str(size),
                CHECKSUM=checksum.hexdigest(), CHECKSUMTYPE=checksum_name)
            flocat = etree.SubElement(file_e, utils.PREFIX_NS['mets'] + 'FLocat', OTHERLOCTYPE="SYSTEM", LOCTYPE="OTHER")
            flocat.set(utils.PREFIX_NS['xlink'] + 'href', out_path)

        # Write out pointer file again
        with open(package.full_pointer_file_path, 'wb') as f:
            f.write(etree.tostring(self.pointer_root, pretty_print=True, xml_declaration=True, encoding='utf-8'))

        return output_files
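For reference, GNU tar can reassemble the chunks this method produces by passing every volume to a multi-volume extract. A sketch with illustrative chunk names:

import subprocess

chunks = ['aip.tar-1', 'aip.tar-2', 'aip.tar-3']  # illustrative names
command = ['tar', '--extract', '--multi-volume']
for chunk in chunks:
    command += ['--file', chunk]
subprocess.check_call(command)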
Example #8
    def _upload_file(self, url, upload_file, resume=False):
        """
        Upload a file of any size to Duracloud.

        If the file is larger than self.CHUNK_SIZE, it is split into chunks,
        and the chunks and a manifest are uploaded.

        :param url: URL to upload the file to.
        :param upload_file: Absolute path to the file to upload.
        :param resume: If True, skip chunks that already exist in Duracloud.
        :returns: None
        :raises: StorageException if error storing file
        """
        LOGGER.debug('Upload %s to %s', upload_file, url)
        filesize = os.path.getsize(upload_file)
        if filesize > self.CHUNK_SIZE:
            LOGGER.debug('%s size (%s) larger than %s', upload_file, filesize,
                         self.CHUNK_SIZE)
            # Create manifest info for complete file.  Eg:
            # <header schemaVersion="0.2">
            #   <sourceContent contentId="chunked/chunked_image.jpg">
            #     <mimetype>application/octet-stream</mimetype>
            #     <byteSize>2222135</byteSize>
            #     <md5>9497f70a1a17943ddfcbed567538900d</md5>
            #   </sourceContent>
            # </header>
            relative_path = urllib.unquote(
                url.replace(self.duraspace_url, '', 1))
            LOGGER.debug('File name: %s', relative_path)
            checksum = utils.generate_checksum(upload_file, 'md5')
            LOGGER.debug('Checksum for %s: %s', upload_file,
                         checksum.hexdigest())
            root = etree.Element('{duracloud.org}chunksManifest',
                                 nsmap={'dur': 'duracloud.org'})
            header = etree.SubElement(root, 'header', schemaVersion="0.2")
            content = etree.SubElement(header,
                                       'sourceContent',
                                       contentId=relative_path)
            etree.SubElement(content,
                             'mimetype').text = 'application/octet-stream'
            etree.SubElement(content, 'byteSize').text = str(filesize)
            etree.SubElement(content, 'md5').text = checksum.hexdigest()
            chunks = etree.SubElement(root, 'chunks')
            # Split file into chunks
            with open(upload_file, 'rb') as f:
                # If resume, check if chunks already exists
                if resume:
                    chunklist = set(self._get_files_list(relative_path))
                    LOGGER.debug('Chunklist %s', chunklist)
                file_complete = False
                i = 0
                while not file_complete:
                    # Setup chunk info
                    chunk_suffix = '.dura-chunk-' + str(i).zfill(4)
                    chunk_path = upload_file + chunk_suffix
                    LOGGER.debug('Chunk path: %s', chunk_path)
                    chunk_url = url + chunk_suffix
                    LOGGER.debug('Chunk URL: %s', chunk_url)
                    chunkid = relative_path + chunk_suffix
                    LOGGER.debug('Chunk ID: %s', chunkid)
                    try:
                        self._process_chunk(f, chunk_path)
                    except StopIteration:
                        file_complete = True
                    # Make chunk element
                    # <chunk chunkId="chunked/chunked_image.jpg.dura-chunk-0000" index="0">
                    #   <byteSize>2097152</byteSize>
                    #   <md5>ddbb227beaac5a9dc34eb49608997abf</md5>
                    # </chunk>
                    checksum = utils.generate_checksum(chunk_path)
                    chunk_e = etree.SubElement(chunks,
                                               'chunk',
                                               chunkId=chunkid)
                    etree.SubElement(chunk_e, 'byteSize').text = str(
                        os.path.getsize(chunk_path))
                    etree.SubElement(chunk_e,
                                     'md5').text = checksum.hexdigest()
                    # Upload chunk
                    # Check if chunk exists already
                    if resume and chunkid in chunklist:
                        LOGGER.info('%s already in Duracloud, skipping upload',
                                    chunk_path)
                    else:
                        self._upload_chunk(chunk_url, chunk_path)
                    # Delete chunk
                    os.remove(chunk_path)
                    i += 1
            # Write .dura-manifest
            manifest_path = upload_file + self.MANIFEST_SUFFIX
            manifest_url = url + self.MANIFEST_SUFFIX
            with open(manifest_path, 'wb') as f:
                f.write(
                    etree.tostring(root,
                                   pretty_print=True,
                                   xml_declaration=True,
                                   encoding='UTF-8'))
            # Upload .dura-manifest
            self._upload_chunk(manifest_url, manifest_path)
            os.remove(manifest_path)
            # TODO what if .dura-manifest over chunksize?
        else:
            # Example URL: https://trial.duracloud.org/durastore/trial261//ts/test.txt
            self._upload_chunk(url, upload_file)
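_process_chunk is not shown on this page. From the loop above it must write the next CHUNK_SIZE bytes of the open file to chunk_path and raise StopIteration once the end of the file is reached (after writing the final short chunk, since the loop still checksums and uploads it). One hypothetical implementation consistent with that contract:

    def _process_chunk(self, fileobj, chunk_path):
        # Hypothetical sketch: write the next chunk, signal completion.
        chunk_data = fileobj.read(self.CHUNK_SIZE)
        with open(chunk_path, 'wb') as f:
            f.write(chunk_data)
        if len(chunk_data) < self.CHUNK_SIZE:  # short read: end of file
            raise StopIteration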
Example #9
    def _download_file(self,
                       url,
                       download_path,
                       expected_size=0,
                       checksum=None):
        """
        Helper to download files from DuraCloud.

        :param url: URL to fetch the file from.
        :param download_path: Absolute path to store the downloaded file at.
        :param expected_size: If non-zero, verify the file is this many bytes.
        :param checksum: If given, verify the file's md5 against this checksum.
        :returns: True on success, False if file not found
        :raises: StorageException if response code not 200 or 404
        """
        LOGGER.debug('URL: %s', url)
        response = self.session.get(url)
        LOGGER.debug('Response: %s', response)
        if response.status_code == 404:
            # Check if chunked by looking for a .dura-manifest
            manifest_url = url + self.MANIFEST_SUFFIX
            LOGGER.debug('Manifest URL: %s', manifest_url)
            response = self.session.get(manifest_url)
            LOGGER.debug('Response: %s', response)
            # No manifest - this file does not exist
            if not response.ok:
                return False

            # Get chunks, expected size, checksum
            root = etree.fromstring(response.content)
            expected_size = int(root.findtext('header/sourceContent/byteSize'))
            checksum = root.findtext('header/sourceContent/md5')
            chunk_elements = root.findall('chunks/chunk')
            # Download each chunk and append to original file
            self.space.create_local_directory(download_path)
            LOGGER.debug('Writing to %s', download_path)
            with open(download_path, 'wb') as output_f:
                for e in chunk_elements:
                    # Parse chunk element
                    chunk = e.attrib['chunkId']
                    size = int(e.findtext('byteSize'))
                    md5 = e.findtext('md5')
                    # Download
                    chunk_url = self.duraspace_url + urllib.quote(chunk)
                    LOGGER.debug('Chunk URL: %s', chunk_url)
                    chunk_path = chunk_url.replace(url, download_path)
                    LOGGER.debug('Chunk path: %s', chunk_path)
                    self._download_file(chunk_url, chunk_path, size, md5)
                    # Append to output
                    with open(chunk_path, 'rb') as chunk_f:
                        shutil.copyfileobj(chunk_f, output_f)
                    # Delete chunk_path
                    os.remove(chunk_path)
        elif response.status_code != 200:
            LOGGER.warning('Response: %s when fetching %s', response, url)
            LOGGER.warning('Response text: %s', response.text)
            raise StorageException('Unable to fetch %s' % url)
        else:  # Status code 200 - file exists
            self.space.create_local_directory(download_path)
            LOGGER.debug('Writing to %s', download_path)
            with open(download_path, 'wb') as f:
                f.write(response.content)

        # Verify file, if size or checksum is known
        if expected_size and os.path.getsize(download_path) != expected_size:
            raise StorageException(
                _('File %(path)s does not match expected size of %(expected_size)s bytes, but was actually %(actual_size)s bytes')
                % {
                    'path': download_path,
                    'expected_size': expected_size,
                    'actual_size': os.path.getsize(download_path)
                })
        calculated_checksum = utils.generate_checksum(download_path, 'md5')
        if checksum and checksum != calculated_checksum.hexdigest():
            raise StorageException(
                'File %s does not match expected checksum of %s, but was actually %s'
                % (download_path, checksum, calculated_checksum.hexdigest()))

        return True
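The manifest parsed here has the same shape as the one built in Examples #8 and #10. A self-contained illustration using the sample values from those comments:

from lxml import etree

sample = b'''<dur:chunksManifest xmlns:dur="duracloud.org">
  <header schemaVersion="0.2">
    <sourceContent contentId="chunked/chunked_image.jpg">
      <byteSize>2222135</byteSize>
      <md5>9497f70a1a17943ddfcbed567538900d</md5>
    </sourceContent>
  </header>
  <chunks>
    <chunk chunkId="chunked/chunked_image.jpg.dura-chunk-0000">
      <byteSize>2097152</byteSize>
      <md5>ddbb227beaac5a9dc34eb49608997abf</md5>
    </chunk>
  </chunks>
</dur:chunksManifest>'''

root = etree.fromstring(sample)
assert root.findtext('header/sourceContent/byteSize') == '2222135'
assert root.findall('chunks/chunk')[0].get('chunkId').endswith('-0000')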
Example #10
    def _upload_file(self, url, upload_file):
        """
        Helper to upload files to Duracloud.

        :param url: URL to upload the file to.
        :param upload_file: Absolute path to the file to upload.
        :returns: None
        :raises: StorageException if error storing file
        """
        LOGGER.debug('Upload %s to %s', upload_file, url)
        filesize = os.path.getsize(upload_file)
        if filesize > self.CHUNK_SIZE:
            LOGGER.debug('%s size (%s) larger than %s', upload_file, filesize,
                         self.CHUNK_SIZE)
            # Create manifest info for complete file.  Eg:
            # <header schemaVersion="0.2">
            #   <sourceContent contentId="chunked/chunked_image.jpg">
            #     <mimetype>application/octet-stream</mimetype>
            #     <byteSize>2222135</byteSize>
            #     <md5>9497f70a1a17943ddfcbed567538900d</md5>
            #   </sourceContent>
            # </header>
            relative_path = urllib.unquote(
                url.replace(self.duraspace_url, '', 1))
            LOGGER.debug('File name: %s', relative_path)
            checksum = utils.generate_checksum(upload_file, 'md5')
            root = etree.Element('{duracloud.org}chunksManifest',
                                 nsmap={'dur': 'duracloud.org'})
            header = etree.SubElement(root, 'header', schemaVersion="0.2")
            content = etree.SubElement(header,
                                       'sourceContent',
                                       contentId=relative_path)
            etree.SubElement(content,
                             'mimetype').text = 'application/octet-stream'
            etree.SubElement(content, 'byteSize').text = str(filesize)
            etree.SubElement(content, 'md5').text = checksum.hexdigest()
            chunks = etree.SubElement(root, 'chunks')
            # Split file into chunks
            with open(upload_file, 'rb') as f:
                i = 0
                chunk_data = f.read(self.CHUNK_SIZE)
                while chunk_data:
                    # Setup chunk info
                    chunk_suffix = '.dura-chunk-' + str(i).zfill(4)
                    chunk_path = upload_file + chunk_suffix
                    LOGGER.debug('Chunk path: %s', chunk_path)
                    chunk_url = url + chunk_suffix
                    LOGGER.debug('Chunk URL: %s', chunk_url)
                    chunkid = relative_path + chunk_suffix
                    LOGGER.debug('Chunk ID: %s', chunkid)
                    # Write chunk
                    with open(chunk_path, 'wb') as fchunk:
                        fchunk.write(chunk_data)
                    # Make chunk element
                    # <chunk chunkId="chunked/chunked_image.jpg.dura-chunk-0000" index="0">
                    #   <byteSize>2097152</byteSize>
                    #   <md5>ddbb227beaac5a9dc34eb49608997abf</md5>
                    # </chunk>
                    checksum = utils.generate_checksum(chunk_path)
                    chunk_e = etree.SubElement(chunks,
                                               'chunk',
                                               chunkId=chunkid)
                    etree.SubElement(chunk_e, 'byteSize').text = str(
                        os.path.getsize(chunk_path))
                    etree.SubElement(chunk_e,
                                     'md5').text = checksum.hexdigest()
                    # Upload chunk
                    self._upload_file(chunk_url, chunk_path)
                    # Delete chunk
                    os.remove(chunk_path)
                    # Read next chunk
                    chunk_data = f.read(self.CHUNK_SIZE)
                    i += 1
            # Write .dura-manifest
            manifest_path = upload_file + self.MANIFEST_SUFFIX
            manifest_url = url + self.MANIFEST_SUFFIX
            with open(manifest_path, 'wb') as f:
                f.write(
                    etree.tostring(root,
                                   pretty_print=True,
                                   xml_declaration=True,
                                   encoding='UTF-8'))
            # Upload .dura-manifest
            self._upload_file(manifest_url, manifest_path)
            os.remove(manifest_path)
            # TODO what if .dura-manifest over chunksize?
        else:
            # Example URL: https://trial.duracloud.org/durastore/trial261//ts/test.txt
            LOGGER.debug('PUT URL: %s', url)
            with open(upload_file, 'rb') as f:
                response = self.session.put(url, data=f)
            LOGGER.debug('Response: %s', response)
            if response.status_code != 201:
                LOGGER.warning('%s: Response: %s', response, response.text)
                raise StorageException('Unable to store %s' % upload_file)
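Neither CHUNK_SIZE nor MANIFEST_SUFFIX appears in these excerpts. MANIFEST_SUFFIX is evidently '.dura-manifest' from the comments above; the chunk size below is only an illustrative guess, assuming the Django-model pattern the examples imply:

class Duracloud(models.Model):
    CHUNK_SIZE = 1024 * 1024 * 1024  # hypothetical: 1 GiB per chunk
    MANIFEST_SUFFIX = '.dura-manifest'  # matches the '.dura-manifest' comments above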