Beispiel #1
0
def largefile_ingest(request):
    '''Large-file ingest.  On GET, displays a form allowing user to
    select a BagIt that has been uploaded to the configured large-file
    ingest staging area for ingest and association with a collection.

    On POST with an invalid form, the form is redisplayed with errors.
    On POST with a valid form, the selected bag is ingested, the outcome
    is reported through ``messages`` (success level when the ingest
    succeeded, error level otherwise) and the user is redirected to
    ``/admin``.

    :param request: current request; ``request.method`` and
        ``request.POST`` are read, and the request is passed on to the
        repository and to the object initializers
    :returns: an ``HttpResponseRedirect`` once a valid POST has been
        processed, otherwise a ``TemplateResponse``
    '''
    context = {}
    template_name = 'file/largefile_ingest.html'

    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)

        # if form is not valid, add to context for redisplay with errors
        if not form.is_valid():
            context['form'] = form

        # otherwise, process the form
        else:
            repo = Repository(request=request)

            # Get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data['comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']

            # create dict with file info to add success/failure info
            file_info = {'label': os.path.basename(bag)}

            # assuming type of ingest from subdirectory
            ingest_type = bag.split('/')[-2]
            # object class to initialize for each supported ingest type
            ingest_classes = {'diskimage': DiskImage, 'video': Video}
            try:
                obj_class = ingest_classes.get(ingest_type)
                if obj_class is None:
                    # fail with a meaningful message rather than an
                    # unbound-variable error on first use of obj
                    raise ValueError("Unsupported ingest type '%s'" % ingest_type)

                obj = obj_class.init_from_bagit(bag, request)

                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora
                content_checksum = obj.content.checksum
                obj._content_checksum = content_checksum
                # clear it out so Fedora can ingest without erroring
                obj.content.checksum = None

                # file URIs also used for supplemental files; needs
                # to be handled the same way as content datastream
                # - look for any supplementN datastreams, store checksum, and remove
                supplemental_checksums = {}
                for i in range(20):
                    try:
                        dsid = 'supplement%d' % i
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found last supplemental file
                        break

                # same for access copy checksum on Video files
                if ingest_type == 'video':
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                # refuse to ingest when an object of the same class with
                # the same label (and a pid) is already in the repository
                existing = repo.find_objects(type=obj_class, label=obj.label)
                if any(match.pid for match in existing):
                    raise ValueError('Duplicate content detected.')

                obj.save(comment)

                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums on
                # supplemental datastreams
                obj = repo.get_object(obj.pid, type=obj_class)

                # if save succeeded (no exceptions), set summary info for display
                file_info.update({'type': ingest_type, 'success': True,
                                  'pid': obj.pid, 'url': obj.get_absolute_url(),
                                  'checksum': obj.content.checksum})
                if ingest_type == 'video':
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksum generated by Fedora
                # (required because of file uri bug in fedora 3.4;
                #  this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []

                if obj.content.checksum != content_checksum:
                    checksum_errors.append('content')

                for dsid, expected in supplemental_checksums.iteritems():
                    dsobj = obj.getDatastreamObject(dsid)
                    if dsobj.checksum != expected:
                        checksum_errors.append(dsid)

                if ingest_type == 'video' and \
                        obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')

                if checksum_errors:
                    plural = len(checksum_errors) > 1
                    message = 'Checksum mismatch%s detected on ' \
                        '%s datastream%s; please contact a repository administrator.'
                    file_info['message'] = message % (
                        'es' if plural else '',
                        ', '.join(checksum_errors),
                        's' if plural else ''
                    )

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({'success': False, 'message': 'BagIt error: %s' % err})

            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content models
                    # pulled from solr results
                    dup_obj = repo.get_object(pid,
                        type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' % (
                        dup_obj.get_absolute_url(), pid)
                    )
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({
                    'success': False,
                    'message': msg
                })

            except Exception as err:
                # logger.exception keeps the traceback for troubleshooting
                logger.exception('Error: %s', err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report the outcome to the user; a failed ingest must not be
            # flagged with a success-level message
            if file_info.get('success'):
                messages.success(request, 'Ingest results: %s' % file_info)
            else:
                messages.error(request, 'Ingest results: %s' % file_info)
            return HttpResponseRedirect("/admin")

    # on GET display form to select item(s) for ingest
    # (a valid POST has already returned above, so only GET gets a new form)
    files = large_file_uploads()
    if request.method == 'GET':
        if files:
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True

    return TemplateResponse(request, template_name, context)
Beispiel #2
0
def largefile_ingest(request):
    '''Large-file ingest.  On GET, displays a form allowing user to
    select a BagIt that has been uploaded to the configured large-file
    ingest staging area for ingest and association with a collection.

    On POST with an invalid form, the form is redisplayed with errors.
    On POST with a valid form, the selected bag is ingested and the
    per-file outcome is added to the template context as
    ``ingest_results``, along with a fresh form (or a ``no_files`` flag).

    :param request: current request; ``request.method`` and
        ``request.POST`` are read, and the request is passed on to the
        repository and to the object initializers
    :returns: a ``TemplateResponse`` rendering
        ``file/largefile_ingest.html``
    '''
    context = {}
    template_name = 'file/largefile_ingest.html'
    form = None

    # on POST, process the form and ingest if valid
    if request.method == 'POST':
        form = LargeFileIngestForm(request.POST)

        # if form is not valid, add to context for redisplay with errors
        if not form.is_valid():
            context['form'] = form

        # otherwise, process the form
        else:
            repo = Repository(request=request)

            # Get collection & check for optional comment
            collection = repo.get_object(pid=form.cleaned_data['collection'],
                                         type=CollectionObject)
            # get user comment if any; default to a generic ingest comment
            comment = form.cleaned_data[
                'comment'] or 'initial repository ingest'
            bag = form.cleaned_data['bag']

            # create dict with file info to add success/failure info
            file_info = {'label': os.path.basename(bag)}

            # assuming type of ingest from subdirectory
            ingest_type = bag.split('/')[-2]
            try:
                # pick the object class for this kind of ingest
                if ingest_type == 'diskimage':
                    obj_class = DiskImage
                elif ingest_type == 'video':
                    obj_class = Video
                else:
                    # report a meaningful failure instead of hitting an
                    # unbound-variable error on the first use of obj
                    raise ValueError(
                        "Unsupported ingest type '%s'" % ingest_type)

                obj = obj_class.init_from_bagit(bag, request)

                # set collection on ingest
                obj.collection = collection

                ## NOTE: Due to a bug in Fedora 3.4 with checksums and
                ## file uri ingest, the content datastream checksum
                ## must be cleared before ingest; manually check it
                ## after ingest to confirm Fedora calculated what we expect.
                ## This work-around can be removed once we upgrade to Fedora 3.6

                # store datastream checksum that would be sent to fedora
                content_checksum = obj.content.checksum
                obj._content_checksum = content_checksum
                # clear it out so Fedora can ingest without erroring
                obj.content.checksum = None

                # file URIs also used for supplemental files; needs
                # to be handled the same way as content datastream
                # - look for any supplementN datastreams, store checksum, and remove
                supplemental_checksums = {}
                for i in range(20):
                    try:
                        dsid = 'supplement%d' % i
                        dsobj = getattr(obj, dsid)
                        supplemental_checksums[dsid] = dsobj.checksum
                        dsobj.checksum = None
                    except AttributeError:
                        # stop iterating - we have found last supplemental file
                        break

                # same for access copy checksum on Video files
                if ingest_type == 'video':
                    access_checksum = obj.access_copy.checksum
                    obj.access_copy.checksum = None

                obj.save(comment)

                # remove the ingested bag from large-file staging area
                shutil.rmtree(bag)

                # re-init to allow checking fedora-calculated checksums on
                # supplemental datastreams
                obj = repo.get_object(obj.pid, type=obj_class)

                # if save succeeded (no exceptions), set summary info for display
                file_info.update({
                    'type': ingest_type,
                    'success': True,
                    'pid': obj.pid,
                    'url': obj.get_absolute_url(),
                    'checksum': obj.content.checksum
                })
                if ingest_type == 'video':
                    file_info['access_checksum'] = obj.access_copy.checksum

                # compare checksum generated by Fedora
                # (required because of file uri bug in fedora 3.4;
                #  this can be removed once we upgrade to fedora 3.6+)
                checksum_errors = []

                if obj.content.checksum != content_checksum:
                    checksum_errors.append('content')

                # separate loop variable so the stored content checksum
                # is never overwritten
                for dsid, expected in supplemental_checksums.iteritems():
                    dsobj = obj.getDatastreamObject(dsid)
                    if dsobj.checksum != expected:
                        checksum_errors.append(dsid)

                if ingest_type == 'video' and \
                        obj.access_copy.checksum != access_checksum:
                    checksum_errors.append('access_copy')

                if checksum_errors:
                    several = len(checksum_errors) > 1
                    message = 'Checksum mismatch%s detected on ' \
                        '%s datastream%s; please contact a repository administrator.'
                    file_info['message'] = message % (
                        'es' if several else '',
                        ', '.join(checksum_errors),
                        's' if several else '')

            except bagit.BagValidationError as err:
                logger.error(err)
                file_info.update({
                    'success': False,
                    'message': 'BagIt error: %s' % err
                })

            # special case: detected as duplicate content
            except DuplicateContent as e:
                # mark as failed and generate message with links to records
                # NOTE: pid url is duplicated logic from web upload view...
                links = []
                for pid in e.pids:
                    # use fedora type-inferring logic with list of content models
                    # pulled from solr results
                    dup_obj = repo.get_object(
                        pid,
                        type=repo.best_subtype_for_object(
                            pid, e.pid_cmodels[pid]))
                    # use appropriate object class to get the object url
                    links.append('<a href="%s">%s</a>' %
                                 (dup_obj.get_absolute_url(), pid))
                msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
                file_info.update({'success': False, 'message': msg})

            except Exception as err:
                # logger.exception keeps the traceback for troubleshooting
                logger.exception('Error: %s', err)
                file_info.update({'success': False, 'message': '%s' % err})

            # report success/failure in the same format as web-upload ingest
            context['ingest_results'] = [file_info]

    # on GET display form to select item(s) for ingest
    # OR on completed valid form post
    files = large_file_uploads()
    if request.method == 'GET' or (form is not None and form.is_valid()):
        if files:
            context['form'] = LargeFileIngestForm()
        else:
            # indicator that no files are available for ingest
            context['no_files'] = True

    return TemplateResponse(request, template_name, context)
Beispiel #3
0
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.
    Returns a list of dictionaries (one per file) reporting ingest
    success or failure.

    :param files: dictionary of files to be ingested; iterated as
        (filename, label) pairs
    :param collection: :class:`~keep.collection.models.CollectionObject` that
        newly ingested objects should be associated with; when ``None``,
        every file is reported as failed
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user.
    :returns: list of dictionaries; each has ``label`` and ``success``,
        plus ``pid``, ``url`` and ``checksum`` on success or ``message``
        on failure
    :raises Exception: if a file cannot be read while its mimetype is
        being determined
    '''

    # NOTE: using this structure for ease of display in django templates (e.g., regroup)
    results = []

    m = magic.Magic(mime=True)
    for filename, label in files.iteritems():

        file_info = {'label': label}

        # check if file is an allowed type

        # NOTE: for single-file upload, browser-set type is
        # available as UploadedFile.content_type - but since
        # browser mimetypes are unreliable, calculate anyway
        try:
            detected = m.from_file(filename)
        except IOError:
            raise Exception('Uploaded file is no longer available for ingest; please try again.')

        # keep only the part before any ';' (drops mimetype parameters)
        mimetype = detected.partition(';')[0]
        if mimetype not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({'success': False,
                              'message': '''File type '%s' is not allowed''' % mimetype})
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({'success': False,
                              'message': '''Collection not selected'''})
            results.append(file_info)
            continue

        # determine what type of object to initialize based on mimetype
        objtype = next((t for t in uploadable_objects
                        if mimetype in t.allowed_mimetypes), None)
        if objtype is None:
            # allowed for upload but no object class handles it: report a
            # per-file failure instead of aborting the whole batch
            file_info.update({'success': False,
                              'message': '''File type '%s' is not supported for ingest''' % mimetype})
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        if os.path.exists(filename + '.md5'):
            with open(filename + '.md5') as md5file:
                # strip stray whitespace (e.g. a trailing newline) so it
                # cannot cause a spurious checksum mismatch
                md5 = md5file.read().strip()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # initialize a new object from the file
        obj = objtype.init_from_file(filename, initial_label=label,
                                     request=request, checksum=md5,
                                     mimetype=mimetype)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({'success': True, 'pid': obj.pid,
                              'url': obj.get_absolute_url(),
                              'checksum': md5})

            # if audio, needs an additional step:
            if objtype == AudioObject:
                # Start asynchronous task to convert audio for access
                # NOTE: not passing in user-upload file so that
                # celery can more easily be run on a separate server
                queue_access_copy(obj)
                # remove the file now that we have successfully ingested
                # NOTE(review): the uploaded file is only removed for audio
                # objects - confirm whether that is intentional
                os.remove(filename)

            # NOTE: could remove MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                dup_obj = repo.get_object(pid,
                    type=repo.best_subtype_for_object(pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' % (
                    dup_obj.get_absolute_url(), pid)
                )

            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({
                'success': False,
                'message': msg
            })

        except Exception as e:
            logger.error('Error ingesting %s: %s' % (filename, e))
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False

            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                # detail may be unset/None; guard so the membership test
                # cannot raise inside this handler
                detail = getattr(e, 'detail', None) or ''
                if 'Checksum Mismatch' in detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)

            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to user
            results.append(file_info)

    return results
Beispiel #4
0
def ingest_files(files, collection, comment, request):
    '''Ingest a dictionary of files as returned by
    :meth:`keep.files.forms.UploadForm.files_to_ingest`.
    Returns a list of per-file dictionaries reporting ingest success or
    failure.

    :param files: dictionary of files to be ingested; iterated as
        (filename, label) pairs
    :param collection: :class:`~keep.collection.models.CollectionObject` that
        newly ingested objects should be associated with; when ``None``,
        every file is reported as failed
    :param comment: save message for fedora ingest
    :param request: :class:`~django.http.HttpRequest`, to access Fedora and
        ingest new objects as the logged-in user.
    :returns: list of dictionaries; each has ``label`` and ``success``,
        plus ``pid``, ``url`` and ``checksum`` on success or ``message``
        on failure
    :raises Exception: if a file cannot be read while its mimetype is
        being determined
    '''

    # NOTE: using this structure for ease of display in django templates (e.g., regroup)
    results = []

    m = magic.Magic(mime=True)
    for filename, label in files.iteritems():

        file_info = {'label': label}

        # check if file is an allowed type

        # NOTE: for single-file upload, browser-set type is
        # available as UploadedFile.content_type - but since
        # browser mimetypes are unreliable, calculate anyway
        try:
            mimetype = m.from_file(filename)
        except IOError:
            raise Exception(
                'Uploaded file is no longer available for ingest; please try again.'
            )

        # discard anything after the first ';' (mimetype parameters)
        mimetype, _separator, _options = mimetype.partition(';')
        if mimetype not in allowed_upload_types(request.user):
            # store error for display on detailed result page
            file_info.update({
                'success': False,
                'message': '''File type '%s' is not allowed''' % mimetype
            })
            # if not an allowed type, no further processing
            results.append(file_info)
            continue

        if collection is None:
            file_info.update({
                'success': False,
                'message': '''Collection not selected'''
            })
            results.append(file_info)
            continue

        # determine what type of object to initialize based on mimetype
        for candidate in uploadable_objects:
            if mimetype in candidate.allowed_mimetypes:
                objtype = candidate
                break
        else:
            # no object class handles this mimetype: record a failure for
            # this file rather than crashing on a None object type
            file_info.update({
                'success': False,
                'message': '''File type '%s' is not supported for ingest''' % mimetype
            })
            results.append(file_info)
            continue

        # if there is an MD5 file (i.e., file was uploaded via ajax),
        # use the contents of that file as checksum
        md5_path = filename + '.md5'
        if os.path.exists(md5_path):
            with open(md5_path) as md5file:
                # drop surrounding whitespace (e.g. trailing newline) so
                # the stored value compares cleanly as a checksum
                md5 = md5file.read().strip()
        # otherwise, calculate the MD5 (single-file upload)
        else:
            md5 = md5sum(filename)

        # initialize a new object from the file
        obj = objtype.init_from_file(filename,
                                     initial_label=label,
                                     request=request,
                                     checksum=md5,
                                     mimetype=mimetype)

        # set collection on ingest
        obj.collection = collection

        try:
            # NOTE: by sending a log message, we force Fedora to store an
            # audit trail entry for object creation, which doesn't happen otherwise
            obj.save(comment)
            file_info.update({
                'success': True,
                'pid': obj.pid,
                'url': obj.get_absolute_url(),
                'checksum': md5
            })

            # if audio, needs an additional step:
            if objtype == AudioObject:
                # Start asynchronous task to convert audio for access
                # NOTE: not passing in user-upload file so that
                # celery can more easily be run on a separate server
                queue_access_copy(obj)
                # remove the file now that we have successfully ingested
                # NOTE(review): removal only happens for audio objects -
                # confirm whether other types should keep the staged file
                os.remove(filename)

            # NOTE: could remove MD5 file (if any) here, but MD5 files
            # should be small and will get cleaned up by the cron script

        # special case: detected as duplicate content
        except DuplicateContent as e:
            # mark as failed and generate message with links to records
            links = []
            repo = Repository(request=request)
            for pid in e.pids:
                # use fedora type-inferring logic with list of content models
                # pulled from solr results
                existing = repo.get_object(
                    pid,
                    type=repo.best_subtype_for_object(
                        pid, e.pid_cmodels[pid]))
                # use appropriate object class to get the object url
                links.append('<a href="%s">%s</a>' %
                             (existing.get_absolute_url(), pid))

            msg = mark_safe('%s: %s' % (unicode(e), '; '.join(links)))
            file_info.update({'success': False, 'message': msg})

        except Exception as e:
            logger.error('Error ingesting %s: %s' % (filename, e))
            logger.debug("Error details:\n" + traceback.format_exc())
            file_info['success'] = False

            # check for Fedora-specific errors
            if isinstance(e, RequestFailed):
                # tolerate a missing/None detail so this handler cannot
                # itself fail and hide the original error
                fedora_detail = getattr(e, 'detail', None) or ''
                if 'Checksum Mismatch' in fedora_detail:
                    file_info['message'] = 'Ingest failed due to a checksum mismatch - ' + \
                        'file may have been corrupted or incompletely uploaded to Fedora'
                else:
                    file_info['message'] = 'Fedora error: ' + unicode(e)

            # non-fedora error
            else:
                file_info['message'] = 'Ingest failed: ' + unicode(e)

        finally:
            # no matter what happened, store results for reporting to user
            results.append(file_info)

    return results